new training process and changes to the models (wip)

Dobromir Popov 2025-04-01 18:43:26 +03:00
parent a78906a888
commit 902593b5f3
6 changed files with 5151 additions and 2635 deletions


@@ -28,14 +28,14 @@ class DQNAgent:
window_size: int,
num_features: int,
timeframes: List[str],
learning_rate: float = 0.0005,  # Reduced learning rate for more stability (was 0.001)
gamma: float = 0.97,  # Slightly reduced discount factor (was 0.99)
epsilon: float = 1.0,
epsilon_min: float = 0.05,  # Increased minimum epsilon for more exploration (was 0.01)
epsilon_decay: float = 0.9975,  # Slower decay rate (was 0.995)
memory_size: int = 20000,  # Increased memory size (was 10000)
batch_size: int = 128,  # Larger batch size (was 64)
target_update: int = 5):  # More frequent target updates (was 10)
self.state_size = state_size
self.action_size = action_size
@@ -70,23 +70,25 @@ class DQNAgent:
).to(self.device)
self.target_net.load_state_dict(self.policy_net.state_dict())
# Initialize optimizer with gradient clipping
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)
# Initialize memories with different priorities (previously a single buffer plus a smaller extrema buffer)
self.memory = deque(maxlen=memory_size)
self.extrema_memory = deque(maxlen=memory_size // 4)  # For extrema points (was memory_size // 5)
self.positive_memory = deque(maxlen=memory_size // 4)  # For positive rewards
# Training metrics
self.update_count = 0
self.losses = []
self.avg_reward = 0
self.no_improvement_count = 0
self.best_reward = float('-inf')
def remember(self, state: np.ndarray, action: int, reward: float,
next_state: np.ndarray, done: bool, is_extrema: bool = False):
"""
Store experience in memory with prioritization
Args:
state: Current state
@@ -97,28 +99,124 @@ class DQNAgent:
is_extrema: Whether this is a local extrema sample (for specialized learning)
"""
experience = (state, action, reward, next_state, done)
# Always add to main memory
self.memory.append(experience)
# Add to specialized memories if applicable
if is_extrema:
self.extrema_memory.append(experience)
# Store positive experiences separately for prioritized replay
if reward > 0:
self.positive_memory.append(experience)
def act(self, state: np.ndarray, explore=True) -> int:
"""Choose action using epsilon-greedy policy with explore flag"""
if explore and random.random() < self.epsilon:
return random.randrange(self.action_size)
with torch.no_grad():
# Ensure state is normalized before inference
state_tensor = self._normalize_state(state)
state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device)
action_probs, extrema_pred = self.policy_net(state_tensor)
return action_probs.argmax().item()
def _normalize_state(self, state: np.ndarray) -> np.ndarray:
"""Normalize the state data to prevent numerical issues"""
# Handle NaN and infinite values
state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)
# Check if state is 1D array (happens in some environments)
if len(state.shape) == 1:
# If 1D, we need to normalize the whole array
normalized_state = state.copy()
# Convert any timestamp or non-numeric data to float
for i in range(len(normalized_state)):
# Check for timestamp-like objects
if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')):
# Convert timestamp to float (seconds since epoch)
normalized_state[i] = float(normalized_state[i].timestamp())
elif not isinstance(normalized_state[i], (int, float, np.number)):
# Set non-numeric data to 0
normalized_state[i] = 0.0
# Ensure all values are float
normalized_state = normalized_state.astype(np.float32)
# Simple min-max normalization for 1D state
state_min = np.min(normalized_state)
state_max = np.max(normalized_state)
if state_max > state_min:
normalized_state = (normalized_state - state_min) / (state_max - state_min)
return normalized_state
# Handle 2D arrays
normalized_state = np.zeros_like(state, dtype=np.float32)
# Convert any timestamp or non-numeric data to float
for i in range(state.shape[0]):
for j in range(state.shape[1]):
if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')):
# Convert timestamp to float (seconds since epoch)
normalized_state[i, j] = float(state[i, j].timestamp())
elif isinstance(state[i, j], (int, float, np.number)):
normalized_state[i, j] = state[i, j]
else:
# Set non-numeric data to 0
normalized_state[i, j] = 0.0
# Loop through each timeframe's features in the combined state
feature_count = state.shape[1] // len(self.timeframes)
for tf_idx in range(len(self.timeframes)):
start_idx = tf_idx * feature_count
end_idx = start_idx + feature_count
# Extract this timeframe's features
tf_features = normalized_state[:, start_idx:end_idx]
# Normalize OHLCV data by the first close price in the window
# This makes price movements relative rather than absolute
price_idx = 3 # Assuming close price is at index 3
if price_idx < tf_features.shape[1]:
reference_price = np.mean(tf_features[:, price_idx])
if reference_price != 0:
# Normalize price-related columns (OHLC)
for i in range(4): # First 4 columns are OHLC
if i < tf_features.shape[1]:
normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price
# Normalize volume using mean and std
vol_idx = 4 # Assuming volume is at index 4
if vol_idx < tf_features.shape[1]:
vol_mean = np.mean(tf_features[:, vol_idx])
vol_std = np.std(tf_features[:, vol_idx])
if vol_std > 0:
normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std
else:
normalized_state[:, start_idx + vol_idx] = 0
# Other features (technical indicators) - normalize with min-max scaling
for i in range(5, feature_count):
if i < tf_features.shape[1]:
feature_min = np.min(tf_features[:, i])
feature_max = np.max(tf_features[:, i])
if feature_max > feature_min:
normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min)
else:
normalized_state[:, start_idx + i] = 0
return normalized_state
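As a quick illustration of the 1-D branch above, here is a standalone sketch (a hypothetical helper, not part of the agent) that mirrors the NaN handling followed by min-max scaling:

import numpy as np

def minmax_1d(state: np.ndarray) -> np.ndarray:
    # Mirrors the 1-D branch: replace non-finite values, then scale to [0, 1]
    state = np.nan_to_num(state.astype(np.float32), nan=0.0, posinf=1.0, neginf=-1.0)
    lo, hi = state.min(), state.max()
    return (state - lo) / (hi - lo) if hi > lo else state

print(minmax_1d(np.array([10.0, 20.0, np.nan, 40.0])))  # [0.25 0.5  0.   1.  ]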
def replay(self, use_prioritized=True) -> float:
"""
Train on a batch of experiences with prioritized sampling
Args:
use_prioritized: Whether to use prioritized replay (parameter was previously use_extrema)
Returns:
float: Loss value
@@ -126,55 +224,67 @@ class DQNAgent:
if len(self.memory) < self.batch_size:
return 0.0
# Sample batch with prioritization (previously mixed regular and extrema samples only)
batch = []
if use_prioritized and len(self.positive_memory) > 0 and len(self.extrema_memory) > 0:
# Prioritized sampling from different memory types
positive_count = min(self.batch_size // 4, len(self.positive_memory))
extrema_count = min(self.batch_size // 4, len(self.extrema_memory))
regular_count = self.batch_size - positive_count - extrema_count
positive_samples = random.sample(list(self.positive_memory), positive_count)
extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
regular_samples = random.sample(list(self.memory), regular_count)
batch = positive_samples + extrema_samples + regular_samples
else:
# Standard sampling
batch = random.sample(self.memory, self.batch_size)
states, actions, rewards, next_states, dones = zip(*batch)
# Normalize states before training
normalized_states = np.array([self._normalize_state(state) for state in states])
normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
# Convert to tensors and move to device
states_tensor = torch.FloatTensor(normalized_states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Get current Q values
current_q_values, extrema_pred = self.policy_net(states_tensor)
current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
# Get next Q values from target network (Double DQN approach)
with torch.no_grad():
# Get actions from policy network
next_actions, _ = self.policy_net(next_states_tensor)
next_actions = next_actions.max(1)[1].unsqueeze(1)
# Get Q values from target network for those actions
next_q_values, _ = self.target_net(next_states_tensor)
next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
# Compute target Q values
target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
# Clamp target values to prevent extreme values
target_q_values = torch.clamp(target_q_values, -100, 100)
# Compute Huber loss (more robust to outliers than the previous MSE loss)
loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
# Optimize
self.optimizer.zero_grad()
loss.backward()
# Apply gradient clipping to prevent exploding gradients
nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
self.optimizer.step()
# Update target network if needed
@@ -201,36 +311,76 @@ class DQNAgent:
if len(states) == 0:
return 0.0
# Normalize states
normalized_states = np.array([self._normalize_state(state) for state in states])
normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
# Convert to tensors
states_tensor = torch.FloatTensor(normalized_states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Forward pass
current_q_values, extrema_pred = self.policy_net(states_tensor)
current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
# Get next Q values (Double DQN approach)
with torch.no_grad():
next_actions, _ = self.policy_net(next_states_tensor)
next_actions = next_actions.max(1)[1].unsqueeze(1)
next_q_values, _ = self.target_net(next_states_tensor)
next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
# Clamp target values
target_q_values = torch.clamp(target_q_values, -100, 100)
# Use Huber loss for extrema training (was MSE)
q_loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
# Full loss
loss = q_loss
# Optimize
self.optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
self.optimizer.step()
return loss.item()
def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
"""Update learning metrics and perform learning rate adjustments if needed"""
# Update average reward with exponential moving average
if self.avg_reward == 0:
self.avg_reward = episode_reward
else:
self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward
# Check if we're making sufficient progress
if episode_reward > (1 + best_reward_threshold) * self.best_reward:
self.best_reward = episode_reward
self.no_improvement_count = 0
return True # Improved
else:
self.no_improvement_count += 1
# If no improvement for a while, adjust learning rate
if self.no_improvement_count >= 10:
current_lr = self.optimizer.param_groups[0]['lr']
new_lr = current_lr * 0.5
if new_lr >= 1e-6: # Don't reduce below minimum threshold
for param_group in self.optimizer.param_groups:
param_group['lr'] = new_lr
logger.info(f"Reducing learning rate from {current_lr} to {new_lr}")
self.no_improvement_count = 0
return False # No improvement
def save(self, path: str):
"""Save model and agent state"""
os.makedirs(os.path.dirname(path), exist_ok=True)
@@ -246,9 +396,13 @@ class DQNAgent:
'epsilon': self.epsilon,
'update_count': self.update_count,
'losses': self.losses,
'optimizer_state': self.optimizer.state_dict(),
'best_reward': self.best_reward,
'avg_reward': self.avg_reward
}
torch.save(state, f"{path}_agent_state.pt")
logger.info(f"Agent state saved to {path}_agent_state.pt")
def load(self, path: str):
"""Load model and agent state"""
@@ -259,8 +413,19 @@ class DQNAgent:
self.target_net.load(f"{path}_target")
# Load agent state
try:
agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device)
self.epsilon = agent_state['epsilon']
self.update_count = agent_state['update_count']
self.losses = agent_state['losses']
self.optimizer.load_state_dict(agent_state['optimizer_state'])
# Load additional metrics if they exist
if 'best_reward' in agent_state:
self.best_reward = agent_state['best_reward']
if 'avg_reward' in agent_state:
self.avg_reward = agent_state['avg_reward']
logger.info(f"Agent state loaded from {path}_agent_state.pt")
except FileNotFoundError:
logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")


@@ -44,13 +44,46 @@ class PricePatternAttention(nn.Module):
return output, attn_weights
class AdaptiveNorm(nn.Module):
"""
Adaptive normalization layer that chooses between different normalization
methods based on input dimensions
"""
def __init__(self, num_features):
super(AdaptiveNorm, self).__init__()
self.batch_norm = nn.BatchNorm1d(num_features, affine=True)
self.group_norm = nn.GroupNorm(min(32, num_features), num_features)
self.layer_norm = nn.LayerNorm([num_features, 1])
def forward(self, x):
# Check input dimensions
batch_size, channels, seq_len = x.size()
# Choose normalization method:
# - batch_size > 1 and seq_len > 1: BatchNorm
# - batch_size == 1 and seq_len > 1: GroupNorm
# - seq_len == 1: per-sample LayerNorm fallback sized to the actual input
if batch_size > 1 and seq_len > 1:
return self.batch_norm(x)
elif seq_len > 1:
return self.group_norm(x)
else:
# For 1D inputs (seq_len=1), we need to adjust the layer norm
# to the actual input size
if not hasattr(self, 'layer_norm_1d') or self.layer_norm_1d.normalized_shape[0] != channels:
self.layer_norm_1d = nn.LayerNorm([channels, seq_len]).to(x.device)
return self.layer_norm_1d(x)
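A minimal usage sketch of the adaptive layer above, showing which path each input shape takes (shapes chosen purely for illustration; assumes the AdaptiveNorm class defined above is in scope):

import torch

norm = AdaptiveNorm(num_features=64)
regular = torch.randn(8, 64, 20)  # batch > 1 and seq_len > 1 -> BatchNorm1d path
single = torch.randn(1, 64, 20)   # batch == 1, seq_len > 1 -> GroupNorm path
tiny = torch.randn(1, 64, 1)      # seq_len == 1 -> per-sample LayerNorm fallback
print(norm(regular).shape, norm(single).shape, norm(tiny).shape)  # all keep their input shape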
class CNNModelPyTorch(nn.Module):
"""
CNN model for trading with multiple timeframes
"""
def __init__(self, window_size=20, num_features=5, output_size=3, timeframes=None):  # Defaults added; all arguments were previously required
super(CNNModelPyTorch, self).__init__()
if timeframes is None:
timeframes = [1]
self.window_size = window_size
self.num_features = num_features
self.output_size = output_size
@@ -73,27 +106,28 @@ class CNNModelPyTorch(nn.Module):
"""Create all model layers with current feature dimensions"""
# Convolutional layers - use total_features as input channels
self.conv1 = nn.Conv1d(self.total_features, 64, kernel_size=3, padding=1)
self.norm1 = AdaptiveNorm(64)  # Was nn.BatchNorm1d(64)
self.dropout1 = nn.Dropout(0.2)
self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
self.norm2 = AdaptiveNorm(128)  # Was nn.BatchNorm1d(128)
self.dropout2 = nn.Dropout(0.3)
self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
self.norm3 = AdaptiveNorm(256)  # Was nn.BatchNorm1d(256)
self.dropout3 = nn.Dropout(0.4)
# Add price pattern attention layer
self.attention = PricePatternAttention(256)
# Extrema detection specialized convolutional layer
self.extrema_conv = nn.Conv1d(256, 128, kernel_size=3, padding=1)  # Smaller kernel for small inputs (was kernel_size=5, padding=2)
self.extrema_norm = AdaptiveNorm(128)  # Was nn.BatchNorm1d(128)
# Fully connected layers - input size will be determined dynamically
self.fc1 = None  # Will be initialized in forward pass (was nn.Linear(window_size * 256, 512))
self.fc2 = nn.Linear(512, 256)
self.dropout_fc = nn.Dropout(0.5)
# Advantage and Value streams (Dueling DQN architecture)
self.fc3 = nn.Linear(256, self.output_size)  # Advantage stream
@@ -131,46 +165,96 @@ class CNNModelPyTorch(nn.Module):
# Ensure input is on the correct device
x = x.to(self.device)
# Check input dimensions and reshape as needed
if len(x.size()) == 2:
# If input is [batch_size, features], reshape to [batch_size, features, 1]
batch_size, feature_dim = x.size()
# Check and handle if input features don't match model expectations
if feature_dim != self.total_features:
logger.warning(f"Input features ({feature_dim}) don't match model features ({self.total_features}), rebuilding layers")
self.rebuild_conv_layers(feature_dim)
# For 1D input, use a sequence length of 1
seq_len = 1
x = x.unsqueeze(2)  # Reshape to [batch, features, 1]
elif len(x.size()) == 3:
# Standard case: [batch_size, window_size, features]
batch_size, seq_len, feature_dim = x.size()
# Check and handle if input dimensions don't match model expectations
if feature_dim != self.total_features:
logger.warning(f"Input features ({feature_dim}) don't match model features ({self.total_features}), rebuilding layers")
self.rebuild_conv_layers(feature_dim)
# Reshape input: [batch, window_size, features] -> [batch, features, window_size]
x = x.permute(0, 2, 1)
else:
raise ValueError(f"Unexpected input shape: {x.size()}, expected 2D or 3D tensor")
# Convolutional layers with dropout - safely handle small spatial dimensions
try:
x = self.dropout1(F.relu(self.norm1(self.conv1(x))))
x = self.dropout2(F.relu(self.norm2(self.conv2(x))))
x = self.dropout3(F.relu(self.norm3(self.conv3(x))))
except Exception as e:
logger.warning(f"Error in convolutional layers: {str(e)}")
# Fallback for very small inputs: skip some convolutions
if seq_len < 3:
# Apply a simpler convolution for very small inputs
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
# Skip last conv if we get dimension errors
try:
x = F.relu(self.conv3(x))
except:
pass
# Store conv features for extrema detection
conv_features = x
# Get the current shape after convolutions
_, channels, conv_seq_len = x.size()
# Initialize fc1 if not created yet or if the shape has changed
if self.fc1 is None:
flattened_size = channels * conv_seq_len
logger.info(f"Initializing fc1 with input size {flattened_size}")
self.fc1 = nn.Linear(flattened_size, 512).to(self.device)
# Apply extrema detection safely
try:
extrema_features = F.relu(self.extrema_norm(self.extrema_conv(conv_features)))
except Exception as e:
logger.warning(f"Error in extrema detection: {str(e)}")
extrema_features = conv_features  # Fallback
# Handle attention for small sequence lengths
if conv_seq_len > 1:
# Reshape for attention: [batch, channels, seq_len] -> [batch, seq_len, channels]
x_attention = x.permute(0, 2, 1)
# Apply attention
try:
attention_output, attention_weights = self.attention(x_attention)
except Exception as e:
logger.warning(f"Error in attention layer: {str(e)}")
# Fallback: don't use attention
# Flatten - get the actual shape for this batch
flattened_size = channels * conv_seq_len
x = x.view(batch_size, flattened_size)
# Check if we need to recreate fc1 with the correct size
if self.fc1.in_features != flattened_size:
logger.info(f"Recreating fc1 layer to match input size {flattened_size}")
self.fc1 = nn.Linear(flattened_size, 512).to(self.device)
# Reinitialize optimizer after changing the model
self.optimizer = optim.Adam(self.parameters(), lr=0.001)
# Fully connected layers with dropout
x = F.relu(self.fc1(x))
x = self.dropout_fc(F.relu(self.fc2(x)))
# Split into advantage and value streams
advantage = self.fc3(x)
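The hunk ends at the advantage stream; the value stream and the merge of the two heads fall outside this excerpt. For reference, a standard dueling-head combination looks like the sketch below (an assumption about the rest of forward(), not code from this commit):

import torch

def dueling_q(value: torch.Tensor, advantage: torch.Tensor) -> torch.Tensor:
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a); centering keeps the two streams identifiable
    return value + advantage - advantage.mean(dim=1, keepdim=True)

q = dueling_q(torch.zeros(2, 1), torch.tensor([[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]]))
print(q[0])  # tensor([-1., 0., 1.])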


@@ -3,6 +3,10 @@ import gym
from gym import spaces
from typing import Dict, Tuple, List
import pandas as pd
import logging
# Configure logger
logger = logging.getLogger(__name__)
class TradingEnvironment(gym.Env):
"""
@@ -12,97 +16,284 @@ class TradingEnvironment(gym.Env):
data: pd.DataFrame,
initial_balance: float = 100.0,
fee_rate: float = 0.0002,
max_steps: int = 1000,
window_size: int = 20,
risk_aversion: float = 0.2,  # Controls how much to penalize volatility
price_scaling: str = 'zscore',  # 'zscore', 'minmax', or 'raw'
reward_scaling: float = 10.0,  # Scale factor for rewards
episode_penalty: float = 0.1):  # Penalty for active positions at end of episode
super(TradingEnvironment, self).__init__()
self.data = data
self.initial_balance = initial_balance
self.fee_rate = fee_rate
self.max_steps = max_steps
self.window_size = window_size
self.risk_aversion = risk_aversion
self.price_scaling = price_scaling
self.reward_scaling = reward_scaling
self.episode_penalty = episode_penalty
# Preprocess data if needed
self._preprocess_data()
# Action space: 0 (BUY), 1 (SELL), 2 (HOLD) (ordering changed from 0=SELL, 1=HOLD, 2=BUY)
self.action_space = spaces.Discrete(3)
# Observation space: price data, technical indicators, and account state
feature_dim = self.data.shape[1] + 3  # Adding position, equity, unrealized_pnl
self.observation_space = spaces.Box(
low=-np.inf,
high=np.inf,
shape=(feature_dim,),  # Was shape=(data.shape[1],)
dtype=np.float32
)
# Initialize state
self.reset()
def _preprocess_data(self):
"""Preprocess data - normalize or standardize features"""
# Store the original data for reference
self.original_data = self.data.copy()
# Normalize price data based on the selected method
if self.price_scaling == 'zscore':
# For each feature, apply z-score normalization
for col in self.data.columns:
if col in ['open', 'high', 'low', 'close']:
mean = self.data[col].mean()
std = self.data[col].std()
if std > 0:
self.data[col] = (self.data[col] - mean) / std
# Normalize volume separately
elif col == 'volume':
mean = self.data[col].mean()
std = self.data[col].std()
if std > 0:
self.data[col] = (self.data[col] - mean) / std
elif self.price_scaling == 'minmax':
# For each feature, apply min-max scaling
for col in self.data.columns:
min_val = self.data[col].min()
max_val = self.data[col].max()
if max_val > min_val:
self.data[col] = (self.data[col] - min_val) / (max_val - min_val)
def reset(self) -> np.ndarray:
"""Reset the environment to initial state"""
self.current_step = self.window_size  # Was 0
self.balance = self.initial_balance
self.position = 0  # 0: no position, 1: long position, -1: short position
self.entry_price = 0
self.entry_time = 0
self.total_trades = 0
self.winning_trades = 0
self.losing_trades = 0
self.total_pnl = 0
self.balance_history = [self.initial_balance]
self.equity_history = [self.initial_balance]
self.max_balance = self.initial_balance
self.max_drawdown = 0
# Trading performance metrics
self.trade_durations = []  # Track how long trades are held
self.returns = []  # Track returns of each trade
# For analyzing trade clustering
self.last_action_time = 0
self.actions_taken = []
return self._get_observation()
def _get_observation(self) -> np.ndarray:
"""Get current observation state with account information"""
# Get market data for the current step
market_data = self.data.iloc[self.current_step].values
# Get current price
current_price = self.original_data.iloc[self.current_step]['close']
# Calculate unrealized PnL
unrealized_pnl = 0
if self.position != 0:
price_diff = current_price - self.entry_price
unrealized_pnl = self.position * price_diff
# Calculate total equity (balance + unrealized PnL)
equity = self.balance + unrealized_pnl
# Normalize account state
normalized_position = self.position # -1, 0, or 1
normalized_equity = equity / self.initial_balance - 1.0 # Percent change from initial
normalized_unrealized_pnl = unrealized_pnl / self.initial_balance if self.initial_balance > 0 else 0
# Combine market data with account state
account_state = np.array([normalized_position, normalized_equity, normalized_unrealized_pnl])
observation = np.concatenate([market_data, account_state])
# Handle any NaN values
observation = np.nan_to_num(observation, nan=0.0)
return observation
def _calculate_reward(self, action: int) -> float:
"""
Calculate reward based on action and outcome with improved risk-adjusted metrics
Args:
action: The action taken (0=BUY, 1=SELL, 2=HOLD)
Returns:
float: Calculated reward value
"""
# Get current price
current_price = self.original_data.iloc[self.current_step]['close']
# Default reward is slightly negative to discourage excessive trading
reward = -0.0001
pnl = 0.0
# Handle different actions based on current position
if self.position == 0: # No position
if action == 0: # BUY
self.position = 1
self.entry_price = current_price
self.entry_time = self.current_step
reward = -self.fee_rate # Small penalty for trading cost
elif action == 1: # SELL (start short position)
self.position = -1
self.entry_price = current_price
self.entry_time = self.current_step
reward = -self.fee_rate # Small penalty for trading cost
# else action == 2 (HOLD) - keep the small negative reward
elif self.position > 0: # Long position
if action == 1: # SELL (close long)
# Calculate profit/loss
price_diff = current_price - self.entry_price
pnl = price_diff / self.entry_price - 2 * self.fee_rate # Account for entry and exit fees
# Adjust reward based on PnL and risk
reward = pnl * self.reward_scaling
# Track trade performance
self.total_trades += 1
if pnl > 0:
self.winning_trades += 1
else:
self.losing_trades += 1
# Calculate trade duration
trade_duration = self.current_step - self.entry_time
self.trade_durations.append(trade_duration)
# Update returns list
self.returns.append(pnl)
# Update balance and reset position
self.balance *= (1 + pnl)
self.balance_history.append(self.balance)
self.max_balance = max(self.max_balance, self.balance)
self.total_pnl += pnl
# Reset position
self.position = 0
elif action == 0:  # BUY (while already long)
# Penalize trying to increase an already active position
reward = -0.001
# else action == 2 (HOLD) - calculate unrealized P&L for reward
else:
price_diff = current_price - self.entry_price
unrealized_pnl = price_diff / self.entry_price
# Small reward/penalty based on unrealized P&L
reward = unrealized_pnl * 0.05  # Scale down to encourage holding good positions
elif self.position < 0:  # Short position
if action == 0: # BUY (close short)
# Calculate profit/loss
price_diff = self.entry_price - current_price
pnl = price_diff / self.entry_price - 2 * self.fee_rate # Account for entry and exit fees
# Adjust reward based on PnL and risk
reward = pnl * self.reward_scaling
# Track trade performance
self.total_trades += 1
if pnl > 0:
self.winning_trades += 1
else:
self.losing_trades += 1
# Calculate trade duration
trade_duration = self.current_step - self.entry_time
self.trade_durations.append(trade_duration)
# Update returns list
self.returns.append(pnl)
# Update balance and reset position
self.balance *= (1 + pnl)
self.balance_history.append(self.balance)
self.max_balance = max(self.max_balance, self.balance)
self.total_pnl += pnl
# Reset position
self.position = 0
elif action == 1: # SELL (while already short)
# Penalize trying to increase an already active position
reward = -0.001
# else action == 2 (HOLD) - calculate unrealized P&L for reward
else:
price_diff = self.entry_price - current_price
unrealized_pnl = price_diff / self.entry_price
# Small reward/penalty based on unrealized P&L
reward = unrealized_pnl * 0.05 # Scale down to encourage holding good positions
# Record the action
self.actions_taken.append(action)
self.last_action_time = self.current_step
# Update equity history (balance + unrealized P&L)
current_equity = self.balance
if self.position != 0:
# Calculate unrealized P&L
if self.position > 0: # Long
price_diff = current_price - self.entry_price
unrealized_pnl = price_diff / self.entry_price * self.balance
else: # Short
price_diff = self.entry_price - current_price
unrealized_pnl = price_diff / self.entry_price * self.balance
current_equity = self.balance + unrealized_pnl
self.equity_history.append(current_equity)
# Calculate current drawdown
peak_equity = max(self.equity_history)
current_drawdown = (peak_equity - current_equity) / peak_equity if peak_equity > 0 else 0
self.max_drawdown = max(self.max_drawdown, current_drawdown)
# Apply risk aversion factor - penalize volatility
if len(self.returns) > 1:
returns_std = np.std(self.returns)
reward -= returns_std * self.risk_aversion
return reward, pnl
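As a concrete check of the long-exit arithmetic above (illustrative numbers only): entering long at 100 and closing at 102 with fee_rate=0.0002 and reward_scaling=10.0 gives

pnl = (102 - 100) / 100 - 2 * 0.0002   # 0.0196
reward = pnl * 10.0                    # 0.196, before the risk_aversion * std(returns) penalty is subtracted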
def step(self, action: int) -> Tuple[np.ndarray, float, bool, Dict]:
"""Execute one step in the environment"""
# Calculate reward and update state
reward, pnl = self._calculate_reward(action)
# Move to next step
self.current_step += 1
@@ -110,26 +301,70 @@ class TradingEnvironment(gym.Env):
# Check if episode is done
done = self.current_step >= min(self.max_steps - 1, len(self.data) - 1)
# Apply penalty if episode ends with open position
if done and self.position != 0:
reward -= self.episode_penalty
# Force close the position at the end if still open
current_price = self.original_data.iloc[self.current_step]['close']
if self.position > 0: # Long position
price_diff = current_price - self.entry_price
pnl = price_diff / self.entry_price - 2 * self.fee_rate
else: # Short position
price_diff = self.entry_price - current_price
pnl = price_diff / self.entry_price - 2 * self.fee_rate
# Update balance
self.balance *= (1 + pnl)
self.total_pnl += pnl
# Track trade
self.total_trades += 1
if pnl > 0:
self.winning_trades += 1
else:
self.losing_trades += 1
# Reset position
self.position = 0
# Get next observation
observation = self._get_observation()
# Calculate sharpe ratio and sortino ratio if possible
sharpe_ratio = 0
sortino_ratio = 0
win_rate = self.winning_trades / max(1, self.total_trades)
if len(self.returns) > 1:
mean_return = np.mean(self.returns)
std_return = np.std(self.returns)
if std_return > 0:
sharpe_ratio = mean_return / std_return
# For sortino, we only consider downside deviation
downside_returns = [r for r in self.returns if r < 0]
if downside_returns:
downside_deviation = np.std(downside_returns)
if downside_deviation > 0:
sortino_ratio = mean_return / downside_deviation
# Calculate average trade duration
avg_trade_duration = np.mean(self.trade_durations) if self.trade_durations else 0
# Additional info
info = {
'balance': self.balance,
'position': self.position,
'total_trades': self.total_trades,
'win_rate': win_rate,
'total_pnl': self.total_pnl,
'max_drawdown': self.max_drawdown,
'sharpe_ratio': sharpe_ratio,
'sortino_ratio': sortino_ratio,
'avg_trade_duration': avg_trade_duration,
'pnl': pnl,
'gain': (self.balance - self.initial_balance) / self.initial_balance
}
return observation, reward, done, info
@@ -143,20 +378,19 @@ class TradingEnvironment(gym.Env):
print(f"Total Trades: {self.total_trades}")
print(f"Win Rate: {self.winning_trades/max(1, self.total_trades):.2%}")
print(f"Total PnL: ${self.total_pnl:.2f}")
print(f"Max Drawdown: {self.max_drawdown:.2%}")
print(f"Sharpe Ratio: {self._calculate_sharpe_ratio():.4f}")
print("-" * 50)
def _calculate_sharpe_ratio(self):  # Replaces the old _calculate_max_drawdown(); drawdown is now tracked incrementally as self.max_drawdown
"""Calculate Sharpe ratio from returns"""
if len(self.returns) < 2:
return 0.0
mean_return = np.mean(self.returns)
std_return = np.std(self.returns)
if std_return == 0:
return 0.0
return mean_return / std_return
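For symmetry with the Sharpe helper above, the Sortino calculation used inline in step() can be restated as a standalone sketch (a hypothetical helper, not a method this class defines):

import numpy as np

def sortino_ratio(returns):
    # Mean return divided by downside deviation, mirroring the inline logic in step()
    if len(returns) < 2:
        return 0.0
    downside = [r for r in returns if r < 0]
    if not downside:
        return 0.0
    dd = np.std(downside)
    return float(np.mean(returns) / dd) if dd > 0 else 0.0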

File diff suppressed because it is too large.

realtime_old.py (new file, 3083 lines)

File diff suppressed because it is too large.

train_improved_rl.py (new file, 547 lines)

@@ -0,0 +1,547 @@
#!/usr/bin/env python
"""
Improved RL Trading with Enhanced Training and Monitoring
This script provides an improved version of the RL training process,
implementing better normalization, reward structure, and model training.
"""
import os
import sys
import logging
import argparse
import time
from datetime import datetime
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
# Add project directory to path if needed
project_root = os.path.dirname(os.path.abspath(__file__))
if project_root not in sys.path:
sys.path.append(project_root)
# Import our custom modules
from NN.models.dqn_agent import DQNAgent
from NN.utils.trading_env import TradingEnvironment
from NN.utils.data_interface import DataInterface
from realtime import BinanceHistoricalData, RealTimeChart
# Configure logging
log_filename = f'improved_rl_training_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(log_filename),
logging.StreamHandler()
]
)
logger = logging.getLogger('improved_rl')
# Parse command line arguments
parser = argparse.ArgumentParser(description='Improved RL Trading with Enhanced Training')
parser.add_argument('--episodes', type=int, default=20, help='Number of episodes to train')
parser.add_argument('--visualize', action='store_true', help='Visualize trades during training')
parser.add_argument('--save-path', type=str, default='NN/models/saved/improved_dqn_agent', help='Path to save trained model')
parser.add_argument('--symbol', type=str, default='ETH/USDT', help='Trading pair symbol')
args = parser.parse_args()
def create_training_environment(symbol, window_size=20):
"""Create and prepare the training environment with data"""
logger.info(f"Setting up training environment for {symbol}")
# Fetch historical data from multiple timeframes
data_interface = DataInterface(symbol)
# Use Binance data provider for fetching data
historical_data = BinanceHistoricalData()
# Fetch data for each timeframe
df_1m = historical_data.get_historical_candles(symbol, interval_seconds=60, limit=1000)
df_5m = historical_data.get_historical_candles(symbol, interval_seconds=300, limit=1000)
df_15m = historical_data.get_historical_candles(symbol, interval_seconds=900, limit=500)
# Ensure all dataframes have index as timestamp type
if df_1m is not None and not df_1m.empty:
if 'timestamp' in df_1m.columns:
df_1m = df_1m.set_index('timestamp')
if df_5m is not None and not df_5m.empty:
if 'timestamp' in df_5m.columns:
df_5m = df_5m.set_index('timestamp')
if df_15m is not None and not df_15m.empty:
if 'timestamp' in df_15m.columns:
df_15m = df_15m.set_index('timestamp')
# Preprocess data (add technical indicators)
df_1m = preprocess_dataframe(df_1m)
df_5m = preprocess_dataframe(df_5m)
df_15m = preprocess_dataframe(df_15m)
# Create environment with all timeframes
env = create_multi_timeframe_env(df_1m, df_5m, df_15m, window_size)
return env, (df_1m, df_5m, df_15m)
def preprocess_dataframe(df):
"""Add technical indicators and preprocess dataframe"""
if df is None or df.empty:
return None
# Drop any missing values
df = df.dropna()
# Ensure it has OHLCV columns
required_columns = ['open', 'high', 'low', 'close', 'volume']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
logger.warning(f"Missing required columns: {missing_columns}")
for col in missing_columns:
# Fill with close price for OHLC if missing
if col in ['open', 'high', 'low'] and 'close' in df.columns:
df[col] = df['close']
# Fill with zeros for volume if missing
elif col == 'volume':
df[col] = 0
# Add simple technical indicators
# 1. Simple Moving Averages
df['sma_5'] = df['close'].rolling(window=5).mean()
df['sma_10'] = df['close'].rolling(window=10).mean()
# 2. Relative Strength Index (RSI)
delta = df['close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
df['rsi'] = 100 - (100 / (1 + rs))
# 3. Bollinger Bands
df['bb_middle'] = df['close'].rolling(window=20).mean()
df['bb_std'] = df['close'].rolling(window=20).std()
df['bb_upper'] = df['bb_middle'] + 2 * df['bb_std']
df['bb_lower'] = df['bb_middle'] - 2 * df['bb_std']
# 4. MACD
df['ema_12'] = df['close'].ewm(span=12, adjust=False).mean()
df['ema_26'] = df['close'].ewm(span=26, adjust=False).mean()
df['macd'] = df['ema_12'] - df['ema_26']
df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
# 5. Price rate of change
df['roc'] = df['close'].pct_change(periods=10) * 100
# Fill any remaining NaN values with 0
df = df.fillna(0)
return df
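A quick smoke test of the preprocessing above on synthetic candles (illustrative only; assumes preprocess_dataframe from this script is in scope):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
close = 100 + rng.normal(0, 1, 200).cumsum()
df = pd.DataFrame({'open': close, 'high': close + 0.5, 'low': close - 0.5,
                   'close': close, 'volume': rng.uniform(1, 10, 200)})
out = preprocess_dataframe(df)
print(out.shape)               # 200 rows, OHLCV plus the indicator columns
print(out.isna().sum().sum())  # 0 - remaining NaNs are filled with 0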
def create_multi_timeframe_env(df_1m, df_5m, df_15m, window_size=20):
"""Create a custom environment that handles multiple timeframes"""
# Ensure we have complete data for all timeframes
min_required_length = window_size + 100 # Add buffer for training
if (df_1m is None or len(df_1m) < min_required_length or
df_5m is None or len(df_5m) < min_required_length or
df_15m is None or len(df_15m) < min_required_length):
raise ValueError(f"Insufficient data for training. Need at least {min_required_length} candles per timeframe.")
# Ensure we only use the last N valid data points
df_1m = df_1m.iloc[-900:].copy() if len(df_1m) > 900 else df_1m.copy()
df_5m = df_5m.iloc[-180:].copy() if len(df_5m) > 180 else df_5m.copy()
df_15m = df_15m.iloc[-60:].copy() if len(df_15m) > 60 else df_15m.copy()
# Reset index to make sure we have continuous integers
df_1m = df_1m.reset_index(drop=True)
df_5m = df_5m.reset_index(drop=True)
df_15m = df_15m.reset_index(drop=True)
# For simplicity, we'll use the 1m data as the base environment
# The other timeframes will be incorporated through observation
env = TradingEnvironment(
data=df_1m,
initial_balance=100.0,
fee_rate=0.0005, # 0.05% fee (typical for crypto exchanges)
max_steps=len(df_1m) - window_size - 50, # Leave some room at the end
window_size=window_size,
risk_aversion=0.2, # Moderately risk-averse
price_scaling='zscore', # Use z-score normalization
reward_scaling=10.0, # Scale rewards for better learning
episode_penalty=0.2 # Penalty for holding positions at end of episode
)
return env
def initialize_agent(env, window_size=20, num_features=0, timeframes=None):
"""Initialize the DQN agent with appropriate parameters"""
if timeframes is None:
timeframes = ['1m', '5m', '15m']
# Calculate input dimensions
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
# If num_features wasn't provided, infer from environment
if num_features == 0:
# Calculate features per timeframe from state dimension and number of timeframes
# Accounting for the 3 additional features (position, equity, unrealized_pnl)
num_features = (state_dim - 3) // len(timeframes)
logger.info(f"Initializing DQN agent: state_dim={state_dim}, action_dim={action_dim}, features={num_features}")
agent = DQNAgent(
state_size=state_dim,
action_size=action_dim,
window_size=window_size,
num_features=num_features,
timeframes=timeframes,
learning_rate=0.0005, # Start with a moderate learning rate
gamma=0.97, # Slightly reduced discount factor for stable learning
epsilon=1.0, # Start with full exploration
epsilon_min=0.05, # Maintain some exploration even at the end
epsilon_decay=0.9975, # Slower decay for more exploration
memory_size=20000, # Larger replay buffer
batch_size=128, # Larger batch size for more stable gradients
target_update=5 # More frequent target network updates
)
return agent
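For instance, if each preprocessed timeframe ends up with 17 feature columns (OHLCV plus the 12 indicators added above), the 1m environment reports state_dim = 17 + 3 = 20, and the inference above yields num_features = (20 - 3) // 3 = 5 per timeframe (illustrative arithmetic; the exact column count depends on preprocess_dataframe).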
def train_agent(env, agent, num_episodes=20, visualize=False, chart=None, save_path=None, save_freq=5):
"""
Train the DQN agent with improved training loop
Args:
env: The trading environment
agent: The DQN agent
num_episodes: Number of episodes to train
visualize: Whether to visualize trades during training
chart: The visualization chart (if visualize=True)
save_path: Path to save the model
save_freq: How often to save checkpoints (in episodes)
Returns:
tuple: (rewards, win_rates, best_reward, best_model_path)
"""
logger.info(f"Starting training for {num_episodes} episodes")
# Initialize metrics tracking
rewards = []
win_rates = []
total_train_time = 0
best_reward = float('-inf')
best_model_path = None
# Create directory for checkpoints if needed
if save_path:
os.makedirs(os.path.dirname(save_path), exist_ok=True)
checkpoint_dir = os.path.join(os.path.dirname(save_path), 'checkpoints')
os.makedirs(checkpoint_dir, exist_ok=True)
# For tracking improvement
last_improved_episode = 0
patience = 10 # Episodes to wait for improvement before early stopping
for episode in range(num_episodes):
start_time = time.time()
# Reset environment and get initial state
state = env.reset()
done = False
episode_reward = 0
step = 0
# Action metrics for this episode
actions_taken = {0: 0, 1: 0, 2: 0} # Track BUY, SELL, HOLD actions
while not done:
# Select action
action = agent.act(state)
# Execute action
next_state, reward, done, info = env.step(action)
# Store experience in replay buffer
is_extrema = False # In a real implementation, detect extrema points
agent.remember(state, action, reward, next_state, done, is_extrema)
# Learn from experience
if len(agent.memory) >= agent.batch_size:
use_prioritized = episode > 1 # Start using prioritized replay after first episode
loss = agent.replay(use_prioritized=use_prioritized)
# Update state and metrics
state = next_state
episode_reward += reward
actions_taken[action] += 1
# Every 100 steps, log progress
if step % 100 == 0 or step < 10:
action_str = "BUY" if action == 0 else "SELL" if action == 1 else "HOLD"
current_price = info.get('current_price', 0)
pnl = info.get('pnl', 0)
balance = info.get('balance', 0)
logger.info(f"Episode {episode}, Step {step}: Action={action_str}, "
f"Reward={reward:.4f}, Balance=${balance:.2f}, PnL={pnl:.4f}")
# Add trade to visualization if enabled
if visualize and chart and action in [0, 1]: # BUY or SELL
chart.add_trade(
price=current_price,
timestamp=datetime.now(),
amount=0.1,
pnl=pnl,
action=action_str
)
step += 1
# Episode finished - calculate metrics
episode_time = time.time() - start_time
total_train_time += episode_time
# Get environment info
win_rate = env.winning_trades / max(1, env.total_trades)
trades = env.total_trades
balance = env.balance
gain = (balance - env.initial_balance) / env.initial_balance
max_drawdown = env.max_drawdown
# Record metrics
rewards.append(episode_reward)
win_rates.append(win_rate)
# Update agent's learning metrics
improved = agent.update_learning_metrics(episode_reward)
# If this is best performance, save the model
if episode_reward > best_reward:
best_reward = episode_reward
if save_path:
best_model_path = f"{save_path}_best"
agent.save(best_model_path)
logger.info(f"New best model saved to {best_model_path} (reward: {best_reward:.2f})")
last_improved_episode = episode
# Regular checkpoint saving
if save_path and episode % save_freq == 0:
checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_episode_{episode}")
agent.save(checkpoint_path)
# Print episode summary
actions_summary = ", ".join([f"{k}:{v}" for k, v in actions_taken.items()])
logger.info(f"Episode {episode} completed in {episode_time:.2f}s")
logger.info(f" Total reward: {episode_reward:.4f}")
logger.info(f" Actions taken: {actions_summary}")
logger.info(f" Trades: {trades}, Win rate: {win_rate:.2%}")
logger.info(f" Balance: ${balance:.2f}, Gain: {gain:.2%}")
logger.info(f" Max Drawdown: {max_drawdown:.2%}")
# Early stopping check
if episode - last_improved_episode >= patience:
logger.info(f"No improvement for {patience} episodes. Early stopping.")
break
# Training complete
avg_time_per_episode = total_train_time / max(1, len(rewards))
logger.info(f"Training completed in {total_train_time:.2f}s ({avg_time_per_episode:.2f}s per episode)")
# Save final model
if save_path:
agent.save(f"{save_path}_final")
logger.info(f"Final model saved to {save_path}_final")
# Return training metrics
return rewards, win_rates, best_reward, best_model_path
def plot_training_results(rewards, win_rates, save_dir=None):
"""Plot training metrics and save the figure"""
plt.figure(figsize=(12, 8))
# Plot rewards
plt.subplot(2, 1, 1)
plt.plot(rewards, 'b-')
plt.title('Training Rewards per Episode')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.grid(True)
# Plot win rates
plt.subplot(2, 1, 2)
plt.plot(win_rates, 'g-')
plt.title('Win Rate per Episode')
plt.xlabel('Episode')
plt.ylabel('Win Rate')
plt.grid(True)
plt.tight_layout()
# Save figure if directory provided
if save_dir:
os.makedirs(save_dir, exist_ok=True)
plt.savefig(os.path.join(save_dir, f'training_results_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png'))
plt.close()
def evaluate_agent(env, agent, num_episodes=5, visualize=False, chart=None):
"""
Evaluate a trained agent on the environment
Args:
env: The trading environment
agent: The trained DQN agent
num_episodes: Number of evaluation episodes
visualize: Whether to visualize trades
chart: The visualization chart (if visualize=True)
Returns:
dict: Evaluation metrics
"""
logger.info(f"Evaluating agent over {num_episodes} episodes")
# Metrics to track
total_rewards = []
total_trades = []
win_rates = []
sharpe_ratios = []
sortino_ratios = []
max_drawdowns = []
final_balances = []
for episode in range(num_episodes):
# Reset environment
state = env.reset()
done = False
episode_reward = 0
# Run episode without exploration
while not done:
action = agent.act(state, explore=False) # No exploration during evaluation
next_state, reward, done, info = env.step(action)
episode_reward += reward
state = next_state
# Add trade to visualization if enabled
if visualize and chart and action in [0, 1]: # BUY or SELL
action_str = "BUY" if action == 0 else "SELL"
current_price = info.get('current_price', 0)
pnl = info.get('pnl', 0)
chart.add_trade(
price=current_price,
timestamp=datetime.now(),
amount=0.1,
pnl=pnl,
action=action_str
)
# Record metrics
total_rewards.append(episode_reward)
total_trades.append(env.total_trades)
win_rates.append(env.winning_trades / max(1, env.total_trades))
sharpe_ratios.append(info.get('sharpe_ratio', 0))
sortino_ratios.append(info.get('sortino_ratio', 0))
max_drawdowns.append(env.max_drawdown)
final_balances.append(env.balance)
logger.info(f"Evaluation episode {episode} - Reward: {episode_reward:.4f}, "
f"Balance: ${env.balance:.2f}, Win rate: {win_rates[-1]:.2%}")
# Calculate average metrics
avg_reward = np.mean(total_rewards)
avg_trades = np.mean(total_trades)
avg_win_rate = np.mean(win_rates)
avg_sharpe = np.mean(sharpe_ratios)
avg_sortino = np.mean(sortino_ratios)
avg_max_drawdown = np.mean(max_drawdowns)
avg_final_balance = np.mean(final_balances)
# Log evaluation summary
logger.info("Evaluation completed:")
logger.info(f" Average reward: {avg_reward:.4f}")
logger.info(f" Average trades per episode: {avg_trades:.2f}")
logger.info(f" Average win rate: {avg_win_rate:.2%}")
logger.info(f" Average Sharpe ratio: {avg_sharpe:.4f}")
logger.info(f" Average Sortino ratio: {avg_sortino:.4f}")
logger.info(f" Average max drawdown: {avg_max_drawdown:.2%}")
logger.info(f" Average final balance: ${avg_final_balance:.2f}")
# Return evaluation metrics
return {
'avg_reward': avg_reward,
'avg_trades': avg_trades,
'avg_win_rate': avg_win_rate,
'avg_sharpe': avg_sharpe,
'avg_sortino': avg_sortino,
'avg_max_drawdown': avg_max_drawdown,
'avg_final_balance': avg_final_balance
}
def main():
"""Main function to run the improved RL training"""
start_time = time.time()
logger.info(f"Starting improved RL training for {args.symbol}")
# Create environment
env, data_frames = create_training_environment(args.symbol)
# Initialize visualization if enabled
chart = None
if args.visualize:
logger.info("Initializing visualization chart")
chart = RealTimeChart(args.symbol)
time.sleep(2) # Give time for chart to initialize
# Initialize agent
agent = initialize_agent(env)
# Train agent
rewards, win_rates, best_reward, best_model_path = train_agent(
env=env,
agent=agent,
num_episodes=args.episodes,
visualize=args.visualize,
chart=chart,
save_path=args.save_path
)
# Plot training results
plot_dir = os.path.join(os.path.dirname(args.save_path), 'plots')
plot_training_results(rewards, win_rates, save_dir=plot_dir)
# Evaluate best model
logger.info("Evaluating best model")
# Load best model for evaluation
if best_model_path:
best_agent = initialize_agent(env)
best_agent.load(best_model_path)
# Evaluate the best model
eval_metrics = evaluate_agent(
env=env,
agent=best_agent,
visualize=args.visualize,
chart=chart
)
# Log evaluation results
logger.info("Best model evaluation complete:")
for metric, value in eval_metrics.items():
logger.info(f" {metric}: {value}")
# Total run time
total_time = time.time() - start_time
logger.info(f"Total run time: {total_time:.2f} seconds")
if __name__ == "__main__":
main()
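Assuming the defaults defined in the argument parser above, a typical invocation of this script might look like:

python train_improved_rl.py --episodes 30 --symbol ETH/USDT --visualize --save-path NN/models/saved/improved_dqn_agent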