initial model changes to fix performance

Dobromir Popov
2025-04-02 14:03:20 +03:00
parent aec536d007
commit 70eb7bba9b
8 changed files with 1619 additions and 279 deletions


@@ -8,6 +8,7 @@ from typing import Tuple, List
import os
import sys
import logging
import torch.nn.functional as F
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
@@ -20,71 +21,124 @@ logger = logging.getLogger(__name__)
class DQNAgent:
"""
Deep Q-Network agent for trading
Uses CNN model as the base network
Uses CNN model as the base network with GPU support
"""
def __init__(self,
state_size: int,
action_size: int,
window_size: int,
num_features: int,
timeframes: List[str],
state_shape: Tuple[int, ...],
n_actions: int,
learning_rate: float = 0.0005, # Reduced learning rate for more stability
gamma: float = 0.97, # Slightly reduced discount factor
epsilon: float = 1.0,
epsilon_min: float = 0.05, # Increased minimum epsilon for more exploration
epsilon_decay: float = 0.9975, # Slower decay rate
memory_size: int = 20000, # Increased memory size
buffer_size: int = 20000, # Increased memory size
batch_size: int = 128, # Larger batch size
target_update: int = 5): # More frequent target updates
target_update: int = 5, # More frequent target updates
device=None): # Device for computations
self.state_size = state_size
self.action_size = action_size
self.window_size = window_size
self.num_features = num_features
self.timeframes = timeframes
# Extract state dimensions
if isinstance(state_shape, tuple) and len(state_shape) > 1:
# Multi-dimensional state (like image or sequence)
self.state_dim = state_shape
else:
# 1D state
if isinstance(state_shape, tuple):
self.state_dim = state_shape[0]
else:
self.state_dim = state_shape
# Store parameters
self.n_actions = n_actions
self.learning_rate = learning_rate
self.gamma = gamma
self.epsilon = epsilon
self.epsilon_min = epsilon_min
self.epsilon_decay = epsilon_decay
self.memory_size = memory_size
self.buffer_size = buffer_size
self.batch_size = batch_size
self.target_update = target_update
# Device configuration
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Set device for computation (default to CPU)
if device is None:
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
self.device = device
# Initialize networks
self.policy_net = CNNModelPyTorch(
window_size=window_size,
num_features=num_features,
output_size=action_size,
timeframes=timeframes
).to(self.device)
# Initialize models with appropriate architecture based on state shape
if isinstance(self.state_dim, tuple) and len(self.state_dim) > 1:
# For image-like states (from RL environment with CNN)
from NN.models.simple_cnn import SimpleCNN
self.policy_net = SimpleCNN(self.state_dim, self.n_actions)
self.target_net = SimpleCNN(self.state_dim, self.n_actions)
else:
# For 1D state vectors (most environments)
from NN.models.simple_mlp import SimpleMLP
self.policy_net = SimpleMLP(self.state_dim, self.n_actions)
self.target_net = SimpleMLP(self.state_dim, self.n_actions)
self.target_net = CNNModelPyTorch(
window_size=window_size,
num_features=num_features,
output_size=action_size,
timeframes=timeframes
).to(self.device)
# Initialize the target network with the same weights as the policy network
self.target_net.load_state_dict(self.policy_net.state_dict())
# Initialize optimizer with gradient clipping
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)
# Set models to eval mode (important for batch norm, dropout)
self.target_net.eval()
# Initialize memories with different priorities
self.memory = deque(maxlen=memory_size)
self.extrema_memory = deque(maxlen=memory_size // 4) # For extrema points
self.positive_memory = deque(maxlen=memory_size // 4) # For positive rewards
# Optimization components
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
self.criterion = nn.MSELoss()
# Training metrics
# Experience replay memory
self.memory = []
self.positive_memory = [] # Special memory for storing good experiences
self.update_count = 0
self.losses = []
self.avg_reward = 0
self.no_improvement_count = 0
self.best_reward = float('-inf')
# Extrema detection tracking
self.last_extrema_pred = {
'class': 2, # Default to "neither" (not extrema)
'confidence': 0.0,
'raw': None
}
self.extrema_memory = [] # Special memory for storing extrema points
# Performance tracking
self.losses = []
self.avg_reward = 0.0
self.best_reward = -float('inf')
self.no_improvement_count = 0
# Check if mixed precision training should be used
self.use_mixed_precision = False
if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and 'DISABLE_MIXED_PRECISION' not in os.environ:
self.use_mixed_precision = True
self.scaler = torch.cuda.amp.GradScaler()
logger.info("Mixed precision training enabled")
else:
logger.info("Mixed precision training disabled")
# Track if we're in training mode
self.training = True
# For compatibility with old code
self.state_size = np.prod(state_shape)
self.action_size = n_actions
self.memory_size = buffer_size
self.timeframes = ["1m", "5m", "15m"][:self.state_dim[0]] # Default timeframes
logger.info(f"DQN Agent using device: {self.device}")
def move_models_to_device(self, device=None):
"""Move models to the specified device (GPU/CPU)"""
if device is not None:
self.device = device
try:
self.policy_net = self.policy_net.to(self.device)
self.target_net = self.target_net.to(self.device)
logger.info(f"Moved models to {self.device}")
return True
except Exception as e:
logger.error(f"Failed to move models to {self.device}: {str(e)}")
return False
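# Illustrative usage sketch: constructing the agent with the new state_shape /
# n_actions signature and moving its networks to an explicit device. The import
# path, state shape and hyperparameters below are assumptions for the example.
#
#   import torch
#   from NN.models.dqn_agent import DQNAgent  # hypothetical module path
#
#   agent = DQNAgent(state_shape=(3, 20), n_actions=3,  # 3 timeframes x 20 features (assumed)
#                    buffer_size=10000, batch_size=64)
#   agent.move_models_to_device(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))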
def remember(self, state: np.ndarray, action: int, reward: float,
next_state: np.ndarray, done: bool, is_extrema: bool = False):
"""
@@ -103,25 +157,472 @@ class DQNAgent:
# Always add to main memory
self.memory.append(experience)
# Add to specialized memories if applicable
if is_extrema:
# Check if this is an extrema point based on our extrema detection head
if hasattr(self, 'last_extrema_pred') and self.last_extrema_pred['class'] != 2:
# Class 0 = bottom, 1 = top, 2 = neither
# Only consider high confidence predictions
if self.last_extrema_pred['confidence'] > 0.7:
self.extrema_memory.append(experience)
# Log this special experience
extrema_type = "BOTTOM" if self.last_extrema_pred['class'] == 0 else "TOP"
logger.info(f"Stored {extrema_type} experience with reward {reward:.4f}")
# For tops and bottoms, also duplicate the experience in memory to learn more from it
for _ in range(2): # Add 2 extra copies
self.memory.append(experience)
# Explicitly marked extrema points also go to extrema memory
elif is_extrema:
self.extrema_memory.append(experience)
# Store positive experiences separately for prioritized replay
if reward > 0:
self.positive_memory.append(experience)
# For very good rewards, duplicate to learn more from them
if reward > 0.1:
for _ in range(min(int(reward * 10), 5)): # Cap at 5 extra copies for very high rewards
self.positive_memory.append(experience)
# Keep memory size under control
if len(self.memory) > self.buffer_size:
# Keep more recent experiences
self.memory = self.memory[-self.buffer_size:]
# Keep specialized memories under control too
if len(self.positive_memory) > self.buffer_size // 4:
self.positive_memory = self.positive_memory[-(self.buffer_size // 4):]
if len(self.extrema_memory) > self.buffer_size // 4:
self.extrema_memory = self.extrema_memory[-(self.buffer_size // 4):]
def act(self, state: np.ndarray, explore=True) -> int:
"""Choose action using epsilon-greedy policy with explore flag"""
if explore and random.random() < self.epsilon:
return random.randrange(self.action_size)
return random.randrange(self.n_actions)
with torch.no_grad():
# Ensure state is normalized before inference
state_tensor = self._normalize_state(state)
state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device)
# Get predictions using the policy network
self.policy_net.eval() # Set to evaluation mode for inference
action_probs, extrema_pred = self.policy_net(state_tensor)
return action_probs.argmax().item()
self.policy_net.train() # Back to training mode
# Get the predicted extrema class (0=bottom, 1=top, 2=neither)
extrema_class = extrema_pred.argmax(dim=1).item()
extrema_confidence = torch.softmax(extrema_pred, dim=1)[0, extrema_class].item()
# Log extrema prediction for significant signals
if extrema_confidence > 0.7 and extrema_class != 2: # Only log strong top/bottom signals
extrema_type = "BOTTOM" if extrema_class == 0 else "TOP" if extrema_class == 1 else "NEITHER"
logger.info(f"High confidence {extrema_type} detected! Confidence: {extrema_confidence:.4f}")
# Store extrema prediction for the environment to use
self.last_extrema_pred = {
'class': extrema_class,
'confidence': extrema_confidence,
'raw': extrema_pred.cpu().numpy()
}
# Get the action with highest Q-value
action = action_probs.argmax().item()
# Adjust action based on extrema prediction (with some probability)
if extrema_confidence > 0.8: # Only adjust for strong signals
if extrema_class == 0: # Bottom detected
# Bias toward BUY at bottoms
if action != 0 and random.random() < 0.3 * extrema_confidence:
logger.info(f"Adjusting action to BUY based on bottom detection")
action = 0 # BUY
elif extrema_class == 1: # Top detected
# Bias toward SELL at tops
if action != 1 and random.random() < 0.3 * extrema_confidence:
logger.info(f"Adjusting action to SELL based on top detection")
action = 1 # SELL
return action
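# Illustrative usage sketch: selecting an action and reading the extrema prediction
# cached by act(). The random state stands in for a real observation; its shape and
# the agent construction mirror the assumed example above.
#
#   import numpy as np
#   agent = DQNAgent(state_shape=(3, 20), n_actions=3)
#   state = np.random.rand(3, 20).astype(np.float32)
#   action = agent.act(state, explore=True)  # 0 = BUY, 1 = SELL in the logic above
#   if agent.last_extrema_pred['class'] != 2 and agent.last_extrema_pred['confidence'] > 0.7:
#       print("strong extrema signal:", agent.last_extrema_pred)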
def replay(self, use_prioritized=True) -> float:
"""Experience replay - learn from stored experiences
Args:
use_prioritized: Whether to use prioritized experience replay
Returns:
float: Training loss
"""
# Check if we have enough samples
if len(self.memory) < self.batch_size:
return 0.0
# Check if mixed precision should be disabled
if 'DISABLE_MIXED_PRECISION' in os.environ:
self.use_mixed_precision = False
# Sample from memory with or without prioritization
if use_prioritized and len(self.positive_memory) > self.batch_size // 4:
# Use prioritized sampling: mix normal samples with positive reward samples
positive_batch_size = min(self.batch_size // 4, len(self.positive_memory))
regular_batch_size = self.batch_size - positive_batch_size
# Get positive examples
positive_batch = random.sample(self.positive_memory, positive_batch_size)
# Get regular examples
regular_batch = random.sample(self.memory, regular_batch_size)
# Combine batches
minibatch = positive_batch + regular_batch
else:
# Use regular uniform sampling
minibatch = random.sample(self.memory, self.batch_size)
# Extract batches with proper tensor conversion
states = np.vstack([self._normalize_state(x[0]) for x in minibatch])
actions = np.array([x[1] for x in minibatch])
rewards = np.array([x[2] for x in minibatch])
next_states = np.vstack([self._normalize_state(x[3]) for x in minibatch])
dones = np.array([x[4] for x in minibatch], dtype=np.float32)
# Convert to torch tensors and move to device
states_tensor = torch.FloatTensor(states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# First training step with mixed precision if available
if self.use_mixed_precision:
loss = self._replay_mixed_precision(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
else:
loss = self._replay_standard(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
# Occasionally train specifically on extrema points, if we have enough
if hasattr(self, 'extrema_memory') and len(self.extrema_memory) >= self.batch_size // 2:
if random.random() < 0.3: # 30% chance to do extra extrema training
# Sample from extrema memory
extrema_batch_size = min(self.batch_size // 2, len(self.extrema_memory))
extrema_batch = random.sample(self.extrema_memory, extrema_batch_size)
# Extract batches with proper tensor conversion
extrema_states = np.vstack([self._normalize_state(x[0]) for x in extrema_batch])
extrema_actions = np.array([x[1] for x in extrema_batch])
extrema_rewards = np.array([x[2] for x in extrema_batch])
extrema_next_states = np.vstack([self._normalize_state(x[3]) for x in extrema_batch])
extrema_dones = np.array([x[4] for x in extrema_batch], dtype=np.float32)
# Convert to torch tensors and move to device
extrema_states_tensor = torch.FloatTensor(extrema_states).to(self.device)
extrema_actions_tensor = torch.LongTensor(extrema_actions).to(self.device)
extrema_rewards_tensor = torch.FloatTensor(extrema_rewards).to(self.device)
extrema_next_states_tensor = torch.FloatTensor(extrema_next_states).to(self.device)
extrema_dones_tensor = torch.FloatTensor(extrema_dones).to(self.device)
# Additional training step focused on extrema points (with smaller learning rate)
original_lr = self.optimizer.param_groups[0]['lr']
# Temporarily reduce learning rate for fine-tuning on extrema
for param_group in self.optimizer.param_groups:
param_group['lr'] = original_lr * 0.5
# Train on extrema
if self.use_mixed_precision:
extrema_loss = self._replay_mixed_precision(
extrema_states_tensor, extrema_actions_tensor, extrema_rewards_tensor,
extrema_next_states_tensor, extrema_dones_tensor
)
else:
extrema_loss = self._replay_standard(
extrema_states_tensor, extrema_actions_tensor, extrema_rewards_tensor,
extrema_next_states_tensor, extrema_dones_tensor
)
# Restore original learning rate
for param_group in self.optimizer.param_groups:
param_group['lr'] = original_lr
logger.info(f"Extra training on extrema points: loss={extrema_loss:.4f}")
# Average the loss
loss = (loss + extrema_loss) / 2
# Store and return loss
self.losses.append(loss)
return loss
def _replay_standard(self, states, actions, rewards, next_states, dones):
"""Standard precision training step"""
# Zero gradients
self.optimizer.zero_grad()
# Get current Q values and extrema predictions
current_q_values, current_extrema_pred = self.policy_net(states)
current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
# Get next Q values from target network
with torch.no_grad():
next_q_values, next_extrema_pred = self.target_net(next_states)
next_q_values = next_q_values.max(1)[0]
# Align batch dimensions; min_size is also needed later for the extrema labels,
# so compute it unconditionally (slicing is a no-op when the sizes already match)
min_size = min(rewards.shape[0], next_q_values.shape[0])
if rewards.shape[0] != next_q_values.shape[0]:
# Log the shape mismatch for debugging
logger.warning(f"Shape mismatch detected in standard replay: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
rewards = rewards[:min_size]
dones = dones[:min_size]
next_q_values = next_q_values[:min_size]
current_q_values = current_q_values[:min_size]
target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
# Compute Q-value loss (primary task)
q_loss = nn.MSELoss()(current_q_values, target_q_values)
# Create extrema labels from price movements (crude approximation)
# If the next state price is higher than current, we might be in an uptrend (not a bottom)
# If the next state price is lower than current, we might be in a downtrend (not a top)
# This is a simplified approximation; in real scenarios we'd want to use actual extrema detection
# Try to extract price from current and next states
# Assuming price is in the last feature
try:
# Extract price feature from sequence data (if available)
if len(states.shape) == 3: # [batch, seq, features]
current_prices = states[:, -1, -1] # Last timestep, last feature
next_prices = next_states[:, -1, -1]
else: # [batch, features]
current_prices = states[:, -1] # Last feature
next_prices = next_states[:, -1]
# Compute price changes
price_changes = (next_prices - current_prices) / current_prices
# Create crude extrema labels:
# 0 = bottom: Large negative price change followed by positive change
# 1 = top: Large positive price change followed by negative change
# 2 = neither: Small or inconsistent changes
# Classify based on price change magnitude
extrema_labels = torch.ones(min_size, dtype=torch.long, device=self.device) * 2 # Default: neither
# Identify potential bottoms (significant negative change)
bottoms = (price_changes < -0.003)
extrema_labels[bottoms] = 0
# Identify potential tops (significant positive change)
tops = (price_changes > 0.003)
extrema_labels[tops] = 1
# Calculate extrema prediction loss (auxiliary task)
if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= min_size:
current_extrema_pred = current_extrema_pred[:min_size]
extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)
# Combined loss (primary + auxiliary with lower weight)
# Typically auxiliary tasks should have lower weight to not dominate the primary task
loss = q_loss + 0.3 * extrema_loss
# Log separate loss components occasionally
if random.random() < 0.01: # Log 1% of the time to avoid flood
logger.info(f"Training losses: Q-loss={q_loss.item():.4f}, Extrema-loss={extrema_loss.item():.4f}")
else:
# Fall back to just Q-value loss if extrema predictions aren't available
loss = q_loss
except Exception as e:
# Fallback if price extraction fails
logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
loss = q_loss
# Backward pass and optimize
loss.backward()
# Gradient clipping to prevent exploding gradients
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
self.optimizer.step()
# Update target network if needed
self.update_count += 1
if self.update_count % self.target_update == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
# Track and decay epsilon
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
return loss.item()
def _replay_mixed_precision(self, states, actions, rewards, next_states, dones):
"""Mixed precision training step for better GPU performance"""
# Check if mixed precision should be explicitly disabled
if 'DISABLE_MIXED_PRECISION' in os.environ:
logger.info("Mixed precision explicitly disabled by environment variable")
return self._replay_standard(states, actions, rewards, next_states, dones)
try:
# Zero gradients
self.optimizer.zero_grad()
# Forward pass with amp autocasting
with torch.cuda.amp.autocast():
# Get current Q values and extrema predictions
current_q_values, current_extrema_pred = self.policy_net(states)
current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
# Get next Q values from target network
with torch.no_grad():
next_q_values, next_extrema_pred = self.target_net(next_states)
next_q_values = next_q_values.max(1)[0]
# Align batch dimensions; min_size is also needed later for the extrema labels,
# so compute it unconditionally (slicing is a no-op when the sizes already match)
min_size = min(rewards.shape[0], next_q_values.shape[0])
if rewards.shape[0] != next_q_values.shape[0]:
# Log the shape mismatch for debugging
logger.warning(f"Shape mismatch detected: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
rewards = rewards[:min_size]
dones = dones[:min_size]
next_q_values = next_q_values[:min_size]
current_q_values = current_q_values[:min_size]
target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
# Compute Q-value loss (primary task)
q_loss = nn.MSELoss()(current_q_values, target_q_values)
# Create extrema labels from price movements (crude approximation)
# Try to extract price from current and next states
try:
# Extract price feature from sequence data (if available)
if len(states.shape) == 3: # [batch, seq, features]
current_prices = states[:, -1, -1] # Last timestep, last feature
next_prices = next_states[:, -1, -1]
else: # [batch, features]
current_prices = states[:, -1] # Last feature
next_prices = next_states[:, -1]
# Compute price changes
price_changes = (next_prices - current_prices) / current_prices
# Create crude extrema labels:
# 0 = bottom: Large negative price change followed by positive change
# 1 = top: Large positive price change followed by negative change
# 2 = neither: Small or inconsistent changes
# Classify based on price change magnitude
extrema_labels = torch.ones(min_size, dtype=torch.long, device=self.device) * 2 # Default: neither
# Identify potential bottoms (significant negative change)
bottoms = (price_changes < -0.003)
extrema_labels[bottoms] = 0
# Identify potential tops (significant positive change)
tops = (price_changes > 0.003)
extrema_labels[tops] = 1
# Calculate extrema prediction loss (auxiliary task)
if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= min_size:
current_extrema_pred = current_extrema_pred[:min_size]
extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)
# Combined loss (primary + auxiliary with lower weight)
loss = q_loss + 0.3 * extrema_loss
# Log separate loss components occasionally
if random.random() < 0.01: # Log 1% of the time to avoid flood
logger.info(f"Mixed precision training losses: Q-loss={q_loss.item():.4f}, Extrema-loss={extrema_loss.item():.4f}")
else:
# Fall back to just Q-value loss
loss = q_loss
except Exception as e:
# Fallback if price extraction fails
logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
loss = q_loss
# Backward pass with scaled gradients
self.scaler.scale(loss).backward()
# Gradient clipping on scaled gradients
self.scaler.unscale_(self.optimizer)
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
# Update with scaler
self.scaler.step(self.optimizer)
self.scaler.update()
# Update target network if needed
self.update_count += 1
if self.update_count % self.target_update == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
# Track and decay epsilon
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
return loss.item()
except Exception as e:
logger.error(f"Error in mixed precision training: {str(e)}")
logger.warning("Falling back to standard precision training")
# Fall back to standard training
return self._replay_standard(states, actions, rewards, next_states, dones)
def train_on_extrema(self, states, actions, rewards, next_states, dones):
"""
Special training function specifically for extrema points
Args:
states: Batch of states at extrema points
actions: Batch of actions
rewards: Batch of rewards
next_states: Batch of next states
dones: Batch of done flags
Returns:
float: Training loss
"""
# Convert to numpy arrays if not already
if not isinstance(states, np.ndarray):
states = np.array(states)
if not isinstance(actions, np.ndarray):
actions = np.array(actions)
if not isinstance(rewards, np.ndarray):
rewards = np.array(rewards)
if not isinstance(next_states, np.ndarray):
next_states = np.array(next_states)
if not isinstance(dones, np.ndarray):
dones = np.array(dones, dtype=np.float32)
# Normalize states
states = np.vstack([self._normalize_state(s) for s in states])
next_states = np.vstack([self._normalize_state(s) for s in next_states])
# Convert to torch tensors and move to device
states_tensor = torch.FloatTensor(states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Choose training method based on precision mode
if self.use_mixed_precision:
return self._replay_mixed_precision(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
else:
return self._replay_standard(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
def _normalize_state(self, state: np.ndarray) -> np.ndarray:
"""Normalize the state data to prevent numerical issues"""
@@ -211,148 +712,6 @@ class DQNAgent:
return normalized_state
def replay(self, use_prioritized=True) -> float:
"""
Train on a batch of experiences with prioritized sampling
Args:
use_prioritized: Whether to use prioritized replay
Returns:
float: Loss value
"""
if len(self.memory) < self.batch_size:
return 0.0
# Sample batch with prioritization
batch = []
if use_prioritized and len(self.positive_memory) > 0 and len(self.extrema_memory) > 0:
# Prioritized sampling from different memory types
positive_count = min(self.batch_size // 4, len(self.positive_memory))
extrema_count = min(self.batch_size // 4, len(self.extrema_memory))
regular_count = self.batch_size - positive_count - extrema_count
positive_samples = random.sample(list(self.positive_memory), positive_count)
extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
regular_samples = random.sample(list(self.memory), regular_count)
batch = positive_samples + extrema_samples + regular_samples
else:
# Standard sampling
batch = random.sample(self.memory, self.batch_size)
states, actions, rewards, next_states, dones = zip(*batch)
# Normalize states before training
normalized_states = np.array([self._normalize_state(state) for state in states])
normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
# Convert to tensors and move to device
states_tensor = torch.FloatTensor(normalized_states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Get current Q values
current_q_values, extrema_pred = self.policy_net(states_tensor)
current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
# Get next Q values from target network (Double DQN approach)
with torch.no_grad():
# Get actions from policy network
next_actions, _ = self.policy_net(next_states_tensor)
next_actions = next_actions.max(1)[1].unsqueeze(1)
# Get Q values from target network for those actions
next_q_values, _ = self.target_net(next_states_tensor)
next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
# Compute target Q values
target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
# Clamp target values to prevent extreme values
target_q_values = torch.clamp(target_q_values, -100, 100)
# Compute Huber loss (more robust to outliers than MSE)
loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
# Optimize
self.optimizer.zero_grad()
loss.backward()
# Apply gradient clipping to prevent exploding gradients
nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
self.optimizer.step()
# Update target network if needed
self.update_count += 1
if self.update_count % self.target_update == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
# Decay epsilon
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
return loss.item()
def train_on_extrema(self, states, actions, rewards, next_states, dones):
"""
Special training method focused on extrema patterns
Args:
states: Array of states near extrema points
actions: Correct actions to take (buy at bottoms, sell at tops)
rewards: Rewards for each action
next_states: Next states
dones: Done flags
"""
if len(states) == 0:
return 0.0
# Normalize states
normalized_states = np.array([self._normalize_state(state) for state in states])
normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
# Convert to tensors
states_tensor = torch.FloatTensor(normalized_states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Forward pass
current_q_values, extrema_pred = self.policy_net(states_tensor)
current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
# Get next Q values (Double DQN approach)
with torch.no_grad():
next_actions, _ = self.policy_net(next_states_tensor)
next_actions = next_actions.max(1)[1].unsqueeze(1)
next_q_values, _ = self.target_net(next_states_tensor)
next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
# Clamp target values
target_q_values = torch.clamp(target_q_values, -100, 100)
# Use Huber loss for extrema training
q_loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
# Full loss
loss = q_loss
# Optimize
self.optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
self.optimizer.step()
return loss.item()
def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
"""Update learning metrics and perform learning rate adjustments if needed"""
# Update average reward with exponential moving average


@@ -74,6 +74,107 @@ class AdaptiveNorm(nn.Module):
self.layer_norm_1d = nn.LayerNorm([channels, seq_len]).to(x.device)
return self.layer_norm_1d(x)
class SimpleCNN(nn.Module):
"""
Simple CNN model for reinforcement learning with image-like state inputs
"""
def __init__(self, input_shape, n_actions):
super(SimpleCNN, self).__init__()
# Store dimensions
self.input_shape = input_shape
self.n_actions = n_actions
# Calculate input dimensions
if len(input_shape) == 3: # [channels, height, width]
self.channels, self.height, self.width = input_shape
self.feature_dim = self.height * self.width
elif len(input_shape) == 2: # [timeframes, features]
self.channels = input_shape[0]
self.features = input_shape[1]
self.feature_dim = self.features
elif len(input_shape) == 1: # [features]
self.channels = 1
self.features = input_shape[0]
self.feature_dim = self.features
else:
raise ValueError(f"Unsupported input shape: {input_shape}")
# Build network
self._build_network()
# Initialize device
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.to(self.device)
logger.info(f"SimpleCNN initialized with input shape: {input_shape}, actions: {n_actions}")
def _build_network(self):
"""Build the neural network with current feature dimensions"""
# Create a flexible architecture that adapts to input dimensions
self.fc_layers = nn.Sequential(
nn.Linear(self.feature_dim, 256),
nn.ReLU(),
nn.Linear(256, 256),
nn.ReLU()
)
# Output heads (Dueling DQN architecture)
self.advantage_head = nn.Linear(256, self.n_actions)
self.value_head = nn.Linear(256, 1)
# Extrema detection head
self.extrema_head = nn.Linear(256, 3) # 0=bottom, 1=top, 2=neither
def _check_rebuild_network(self, features):
"""Check if network needs to be rebuilt for different feature dimensions"""
if features != self.feature_dim:
logger.info(f"Rebuilding network for new feature dimension: {features} (was {self.feature_dim})")
self.feature_dim = features
self._build_network()
# Move to device after rebuilding
self.to(self.device)
return True
return False
def forward(self, x):
"""
Forward pass through the network
Returns both action values and extrema predictions
"""
# Handle different input shapes
if len(x.shape) == 2: # [batch_size, features]
# Simple feature vector
batch_size, features = x.shape
# Check if we need to rebuild the network for new dimensions
self._check_rebuild_network(features)
elif len(x.shape) == 3: # [batch_size, timeframes/channels, features]
# Reshape to flatten timeframes/channels with features
batch_size, timeframes, features = x.shape
total_features = timeframes * features
# Check if we need to rebuild the network for new dimensions
self._check_rebuild_network(total_features)
# Reshape tensor to [batch_size, total_features]
x = x.reshape(batch_size, total_features)
# Apply fully connected layers
fc_out = self.fc_layers(x)
# Dueling architecture
advantage = self.advantage_head(fc_out)
value = self.value_head(fc_out)
# Q-values = value + (advantage - mean(advantage))
action_values = value + advantage - advantage.mean(dim=1, keepdim=True)
# Extrema predictions
extrema_pred = self.extrema_head(fc_out)
return action_values, extrema_pred
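# Illustrative sketch of the dueling combination computed above, Q = V + (A - mean(A)).
# The numbers are invented; the point is that centering the advantages leaves the
# action ranking unchanged while the value head sets the overall level.
#
#   import torch
#   value = torch.tensor([[0.5]])                  # V(s), shape [batch, 1]
#   advantage = torch.tensor([[1.0, -1.0, 0.0]])   # A(s, a), shape [batch, n_actions]
#   q = value + advantage - advantage.mean(dim=1, keepdim=True)
#   # q -> tensor([[1.5, -0.5, 0.5]])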
class CNNModelPyTorch(nn.Module):
"""
CNN model for trading with multiple timeframes

NN/models/simple_mlp.py (new file, 70 added lines)

@@ -0,0 +1,70 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
import logging
# Configure logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SimpleMLP(nn.Module):
"""
Simple Multi-Layer Perceptron for reinforcement learning with vector state inputs
Implements dueling architecture for better Q-learning
"""
def __init__(self, state_dim, n_actions):
super(SimpleMLP, self).__init__()
# Store dimensions
self.state_dim = state_dim
self.n_actions = n_actions
# Calculate input size
if isinstance(state_dim, tuple):
self.input_size = int(np.prod(state_dim))
else:
self.input_size = state_dim
# Hidden layers
self.fc1 = nn.Linear(self.input_size, 256)
self.fc2 = nn.Linear(256, 256)
# Dueling architecture
self.advantage = nn.Linear(256, n_actions)
self.value = nn.Linear(256, 1)
# Extrema detection
self.extrema_head = nn.Linear(256, 3) # 0=bottom, 1=top, 2=neither
# Move to appropriate device
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.to(self.device)
logger.info(f"SimpleMLP initialized with input size: {self.input_size}, actions: {n_actions}")
def forward(self, x):
"""
Forward pass through the network
Returns both action values and extrema predictions
"""
# Handle different input shapes
if isinstance(self.state_dim, tuple) and len(self.state_dim) > 1:
x = x.view(-1, self.input_size)
# Main network
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
# Dueling architecture
advantage = self.advantage(x)
value = self.value(x)
# Combine value and advantage (Q = V + A - mean(A))
q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
# Extrema predictions
extrema = F.softmax(self.extrema_head(x), dim=1)
return q_values, extrema
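# Illustrative usage sketch: the dueling MLP maps a flat state vector to Q-values
# plus a softmaxed extrema classification. The state size and batch size below are
# assumptions for the example.
#
#   import torch
#   net = SimpleMLP(state_dim=(60,), n_actions=3)
#   x = torch.randn(8, 60, device=net.device)  # batch of 8 flat states
#   q_values, extrema = net(x)                 # both have shape [8, 3]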


@@ -29,6 +29,21 @@ logging.basicConfig(
]
)
# Set up device for PyTorch (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Log GPU status
if torch.cuda.is_available():
gpu_count = torch.cuda.device_count()
gpu_names = [torch.cuda.get_device_name(i) for i in range(gpu_count)]
logger.info(f"Using GPU: {gpu_names}")
# Check for BFloat16 support (available on NVIDIA Ampere and newer GPUs) for faster training
if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
logger.info("BFloat16 precision is supported - will use for faster training")
else:
logger.warning("GPU not available. Using CPU for training (slower).")
class RLTradingEnvironment(gym.Env):
"""
Reinforcement Learning environment for trading with technical indicators
@@ -266,87 +281,151 @@ class RLTradingEnvironment(gym.Env):
def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent",
action_callback=None, episode_callback=None, symbol="BTC/USDT"):
"""
Train DQN agent for RL-based trading with extended training and monitoring
Train a reinforcement learning agent for trading
Args:
env_class: Optional environment class to use, defaults to RLTradingEnvironment
num_episodes: Number of episodes to train
env_class: Optional environment class override
num_episodes: Number of episodes to train for
max_steps: Maximum steps per episode
save_path: Path to save the model
action_callback: Optional callback for each action (step, action, price, reward, info)
episode_callback: Optional callback after each episode (episode, reward, info)
symbol: Trading pair symbol (e.g., "BTC/USDT")
save_path: Path to save the trained model
action_callback: Callback function for monitoring actions
episode_callback: Callback function for monitoring episodes
symbol: Trading symbol to use
Returns:
DQNAgent: The trained agent
tuple: (trained agent, environment)
"""
import pandas as pd
from NN.utils.data_interface import DataInterface
# Load data for the selected symbol
data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m'])
logger.info("Starting DQN training for RL trading")
try:
# Try to load data for the requested symbol using get_historical_data method
data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)
if data_1m is None or data_5m is None or data_15m is None:
raise FileNotFoundError("Could not retrieve data for specified symbol")
except Exception as e:
logger.warning(f"Data for {symbol} not available: {str(e)}. Using default data.")
# Try to use cached data if available
symbol = "BTC/USDT"
data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m'])
data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)
if data_1m is None or data_5m is None or data_15m is None:
logger.error("Failed to retrieve any data. Cannot continue training.")
raise ValueError("No data available for training")
# Create data interface with specified symbol
data_interface = DataInterface(symbol=symbol)
# Load and preprocess data
logger.info(f"Loading data from multiple timeframes for {symbol}")
features_1m = data_interface.get_training_data("1m", n_candles=2000)
features_5m = data_interface.get_training_data("5m", n_candles=1000)
features_15m = data_interface.get_training_data("15m", n_candles=500)
# Check if we have all the data
if features_1m is None or features_5m is None or features_15m is None:
logger.error("Failed to load training data from one or more timeframes")
return None
# If data is a DataFrame, convert to numpy array excluding the timestamp column
if isinstance(features_1m, pd.DataFrame):
features_1m = features_1m.drop('timestamp', axis=1, errors='ignore').values
if isinstance(features_5m, pd.DataFrame):
features_5m = features_5m.drop('timestamp', axis=1, errors='ignore').values
if isinstance(features_15m, pd.DataFrame):
features_15m = features_15m.drop('timestamp', axis=1, errors='ignore').values
# Initialize environment or use provided class
if env_class is None:
env = RLTradingEnvironment(features_1m, features_5m, features_15m)
# Create features from the data by adding technical indicators and converting to numpy format
if data_1m is not None:
data_1m = data_interface.add_technical_indicators(data_1m)
# Convert to numpy array with close price as the last column
features_1m = np.hstack([
data_1m.drop(['timestamp', 'close'], axis=1).values,
data_1m['close'].values.reshape(-1, 1)
])
else:
features_1m = None
if data_5m is not None:
data_5m = data_interface.add_technical_indicators(data_5m)
# Convert to numpy array with close price as the last column
features_5m = np.hstack([
data_5m.drop(['timestamp', 'close'], axis=1).values,
data_5m['close'].values.reshape(-1, 1)
])
else:
features_5m = None
if data_15m is not None:
data_15m = data_interface.add_technical_indicators(data_15m)
# Convert to numpy array with close price as the last column
features_15m = np.hstack([
data_15m.drop(['timestamp', 'close'], axis=1).values,
data_15m['close'].values.reshape(-1, 1)
])
else:
features_15m = None
# Check if we have all the required features
if features_1m is None or features_5m is None or features_15m is None:
logger.error("Failed to create features for all timeframes.")
raise ValueError("Could not create features for training")
# Create the environment
if env_class:
# Use provided environment class
env = env_class(features_1m, features_5m, features_15m)
else:
# Use the default environment
env = RLTradingEnvironment(features_1m, features_5m, features_15m)
# Set action callback if provided
if action_callback:
def step_callback(action, price, reward, info):
action_callback(env.current_step, action, price, reward, info)
env.set_action_callback(step_callback)
env.set_action_callback(action_callback)
# Initialize agent
window_size = env.window_size
num_features = env.num_features * env.num_timeframes
action_size = env.action_space.n
timeframes = ['1m', '5m', '15m'] # Match the timeframes from the environment
# Get environment properties for agent creation
input_shape = env.observation_space.shape
n_actions = env.action_space.n
# Create the agent
agent = DQNAgent(
state_size=window_size * num_features,
action_size=action_size,
window_size=window_size,
num_features=env.num_features,
timeframes=timeframes,
memory_size=100000,
batch_size=64,
state_shape=input_shape,
n_actions=n_actions,
epsilon=1.0,
epsilon_decay=0.995,
epsilon_min=0.01,
learning_rate=0.0001,
gamma=0.99,
epsilon=1.0,
epsilon_min=0.01,
epsilon_decay=0.995
buffer_size=10000,
batch_size=64,
device=device # Pass device to agent for GPU usage
)
# Training variables
best_reward = -float('inf')
episode_rewards = []
# Check if model file exists and load it
model_file = f"{save_path}_model.pth"
if os.path.exists(model_file):
try:
agent.load(model_file)
logger.info(f"Loaded existing model from {model_file}")
except Exception as e:
logger.error(f"Error loading model: {e}")
else:
logger.info("No existing model found. Starting with a new model.")
# TensorBoard writer for logging
writer = SummaryWriter(log_dir=f'runs/rl_trading_{int(time.time())}')
# Create TensorBoard writer
writer = SummaryWriter(log_dir=f'runs/dqn_{int(time.time())}')
# Log GPU status to TensorBoard
writer.add_text("hardware/device", str(device), 0)
if torch.cuda.is_available():
for i in range(torch.cuda.device_count()):
writer.add_text(f"hardware/gpu_{i}", torch.cuda.get_device_name(i), 0)
# Training loop
total_rewards = []
trade_win_rates = []
best_reward = -np.inf
# Move models to the appropriate device if not already there
agent.move_models_to_device(device)
# Enable mixed precision if GPU and feature is available
use_mixed_precision = False
if torch.cuda.is_available() and hasattr(torch.cuda, 'amp'):
logger.info("Enabling mixed precision training")
use_mixed_precision = True
scaler = torch.cuda.amp.GradScaler()
# Define step callback for tensorboard logging and model tracking
def step_callback(action, price, reward, info):
# Pass to external callback if provided
if action_callback:
action_callback(env.current_step, action, price, reward, info)
# Main training loop
logger.info(f"Starting training for {num_episodes} episodes...")
logger.info(f"Starting training on device: {agent.device}")
@@ -378,12 +457,7 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
break
# Track rewards
episode_rewards.append(total_reward)
# Log progress
avg_reward = np.mean(episode_rewards[-100:])
logger.info(f"Episode {episode}/{num_episodes} - Reward: {total_reward:.4f}, " +
f"Avg (100): {avg_reward:.4f}, Epsilon: {agent.epsilon:.4f}")
total_rewards.append(total_reward)
# Calculate trading metrics
win_rate = env.win_rate if hasattr(env, 'win_rate') else 0
@@ -391,15 +465,14 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
# Log to TensorBoard
writer.add_scalar('Reward/Episode', total_reward, episode)
writer.add_scalar('Reward/Average100', avg_reward, episode)
writer.add_scalar('Trade/WinRate', win_rate, episode)
writer.add_scalar('Trade/Count', trades, episode)
# Save best model
if avg_reward > best_reward and episode > 10:
logger.info(f"New best average reward: {avg_reward:.4f}, saving model")
if total_reward > best_reward and episode > 10:
logger.info(f"New best average reward: {total_reward:.4f}, saving model")
agent.save(save_path)
best_reward = avg_reward
best_reward = total_reward
# Periodic save every 100 episodes
if episode % 100 == 0 and episode > 0:
@@ -424,7 +497,7 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
# Close TensorBoard writer
writer.close()
return agent
return agent, env
if __name__ == "__main__":
train_rl()