# gogo2/NN/models/dqn_agent.py

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
from typing import Tuple, List
import os
import sys
import logging
import torch.nn.functional as F
import time
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
# Configure logger
logger = logging.getLogger(__name__)
class DQNAgent:
"""
Deep Q-Network agent for trading
Uses Enhanced CNN model as the base network with GPU support for improved performance
"""
    def __init__(self,
                 state_shape: Tuple[int, ...],
                 n_actions: int = 2,
                 learning_rate: float = 0.001,
                 epsilon: float = 1.0,
                 epsilon_min: float = 0.01,
                 epsilon_decay: float = 0.995,
                 buffer_size: int = 10000,
                 batch_size: int = 32,
                 target_update: int = 100,
                 priority_memory: bool = True,
                 device=None):
        # Extract state dimensions
        if isinstance(state_shape, tuple) and len(state_shape) > 1:
            # Multi-dimensional state (like image or sequence)
            self.state_dim = state_shape
        else:
            # 1D state
            if isinstance(state_shape, tuple):
                self.state_dim = state_shape[0]
            else:
                self.state_dim = state_shape
        # Store parameters
        self.n_actions = n_actions
        self.learning_rate = learning_rate
        self.gamma = 0.99  # Discount factor for replay targets (referenced below; 0.99 is an assumed default)
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.target_update = target_update

        # Set device for computation (default to GPU if available)
        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device
        # Initialize models with Enhanced CNN architecture for better performance
        from NN.models.enhanced_cnn import EnhancedCNN

        # Use Enhanced CNN for both policy and target networks
        self.policy_net = EnhancedCNN(self.state_dim, self.n_actions)
        self.target_net = EnhancedCNN(self.state_dim, self.n_actions)

        # Initialize the target network with the same weights as the policy network
        self.target_net.load_state_dict(self.policy_net.state_dict())

        # Set models to eval mode (important for batch norm, dropout)
        self.target_net.eval()

        # Optimization components
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

        # Experience replay memory
        self.memory = []
        self.positive_memory = []  # Special memory for storing good experiences
        self.update_count = 0

        # Extrema detection tracking
        self.last_extrema_pred = {
            'class': 2,  # Default to "neither" (not extrema)
            'confidence': 0.0,
            'raw': None
        }
        self.extrema_memory = []  # Special memory for storing extrema points

        # Price prediction tracking
        self.last_price_pred = {
            'immediate': {
                'direction': 1,  # Default to "sideways"
                'confidence': 0.0,
                'change': 0.0
            },
            'midterm': {
                'direction': 1,  # Default to "sideways"
                'confidence': 0.0,
                'change': 0.0
            },
            'longterm': {
                'direction': 1,  # Default to "sideways"
                'confidence': 0.0,
                'change': 0.0
            }
        }
        # Store separate memory for price direction examples
        self.price_movement_memory = []  # For storing examples of clear price movements

        # Performance tracking
        self.losses = []
        self.avg_reward = 0.0
        self.best_reward = -float('inf')
        self.no_improvement_count = 0

        # Confidence tracking
        self.confidence_history = []
        self.avg_confidence = 0.0
        self.max_confidence = 0.0
        self.min_confidence = 1.0

        # Enhanced features from EnhancedDQNAgent
        # Market adaptation capabilities
        self.market_regime_weights = {
            'trending': 1.2,  # Higher confidence in trending markets
            'ranging': 0.8,   # Lower confidence in ranging markets
            'volatile': 0.6   # Much lower confidence in volatile markets
        }

        # Dueling network support (requires enhanced network architecture)
        self.use_dueling = True

        # Prioritized experience replay parameters
        self.use_prioritized_replay = priority_memory
        self.alpha = 0.6  # Priority exponent
        self.beta = 0.4   # Importance sampling exponent
        self.beta_increment = 0.001

        # Double DQN support
        self.use_double_dqn = True

        # Enhanced training features from EnhancedDQNAgent
        self.target_update_freq = target_update  # More descriptive name
        self.training_steps = 0
        self.gradient_clip_norm = 1.0  # Gradient clipping

        # Enhanced statistics tracking
        self.epsilon_history = []
        self.td_errors = []  # Track TD errors for analysis

        # Trade action fee and confidence thresholds
        self.trade_action_fee = 0.0005  # Small fee to discourage unnecessary trading
        self.minimum_action_confidence = 0.3  # Minimum confidence to consider trading (lowered from 0.5)
        self.recent_actions = deque(maxlen=10)
        self.recent_prices = deque(maxlen=20)
        self.recent_rewards = deque(maxlen=100)

        # Violent move detection
        self.price_history = []
        self.volatility_window = 20  # Window size for volatility calculation
        self.volatility_threshold = 0.0015  # Threshold for considering a move "violent"
        self.post_violent_move = False  # Flag for recent violent move
        self.violent_move_cooldown = 0  # Cooldown after violent move

        # Feature integration
        self.last_hidden_features = None  # Store last extracted features
        self.feature_history = []  # Store history of features for analysis

        # Real-time tick features integration
        self.realtime_tick_features = None  # Latest tick features from tick processor
        self.tick_feature_weight = 0.3  # Weight for tick features in decision making

        # Check if mixed precision training should be used
        self.use_mixed_precision = False
        if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and 'DISABLE_MIXED_PRECISION' not in os.environ:
            self.use_mixed_precision = True
            self.scaler = torch.cuda.amp.GradScaler()
            logger.info("Mixed precision training enabled")
        else:
            logger.info("Mixed precision training disabled")
        # Track if we're in training mode
        self.training = True

        # For compatibility with old code
        self.state_size = np.prod(state_shape)
        self.action_size = n_actions
        self.memory_size = buffer_size
        self.timeframes = ["1m", "5m", "15m"][:self.state_dim[0] if isinstance(self.state_dim, tuple) else 3]  # Default timeframes

        logger.info(f"DQN Agent using Enhanced CNN with device: {self.device}")
        logger.info(f"Trade action fee set to {self.trade_action_fee}, minimum confidence: {self.minimum_action_confidence}")
        logger.info(f"Real-time tick feature integration enabled with weight: {self.tick_feature_weight}")

        # Log model parameters
        total_params = sum(p.numel() for p in self.policy_net.parameters())
        logger.info(f"Enhanced CNN Policy Network: {total_params:,} parameters")

        # Position management for 2-action system
        self.current_position = 0.0  # -1 (short), 0 (neutral), 1 (long)
        self.position_entry_price = 0.0
        self.position_entry_time = None

        # Different thresholds for entry vs exit decisions
        self.entry_confidence_threshold = 0.7  # High threshold for new positions
        self.exit_confidence_threshold = 0.3  # Lower threshold for closing positions
        self.uncertainty_threshold = 0.1  # When to stay neutral
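    # Minimal usage sketch (illustrative only; the concrete state layout and shapes
    # depend on the data pipeline feeding this agent):
    #
    #     agent = DQNAgent(state_shape=(3, 50), n_actions=2)
    #     action = agent.act(state, explore=True, current_price=price)  # 0=SELL, 1=BUY, None=HOLD
    #     if action is not None:
    #         agent.remember(state, action, reward, next_state, done)
    #     loss = agent.replay()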
    def move_models_to_device(self, device=None):
        """Move models to the specified device (GPU/CPU)"""
        if device is not None:
            self.device = device

        try:
            self.policy_net = self.policy_net.to(self.device)
            self.target_net = self.target_net.to(self.device)
            logger.info(f"Moved models to {self.device}")
            return True
        except Exception as e:
            logger.error(f"Failed to move models to {self.device}: {str(e)}")
            return False
    def remember(self, state: np.ndarray, action: int, reward: float,
                 next_state: np.ndarray, done: bool, is_extrema: bool = False):
        """
        Store experience in memory with prioritization

        Args:
            state: Current state
            action: Action taken
            reward: Reward received
            next_state: Next state
            done: Whether episode is done
            is_extrema: Whether this is a local extrema sample (for specialized learning)
        """
        experience = (state, action, reward, next_state, done)

        # Always add to main memory
        self.memory.append(experience)

        # Try to extract price change to analyze the experience
        try:
            # Extract price feature from sequence data (if available)
            if len(state.shape) > 1:  # 2D state [timeframes, features]
                current_price = state[-1, -1]  # Last timeframe, last feature
                next_price = next_state[-1, -1]
            else:  # 1D state
                current_price = state[-1]  # Last feature
                next_price = next_state[-1]

            # Calculate price change - avoid division by zero
            if np.isscalar(current_price) and current_price != 0:
                price_change = (next_price - current_price) / current_price
            elif isinstance(current_price, np.ndarray):
                # Handle array case - protect against division by zero
                with np.errstate(divide='ignore', invalid='ignore'):
                    price_change = (next_price - current_price) / current_price
                    # Replace infinities and NaNs with zeros
                    if isinstance(price_change, np.ndarray):
                        price_change = np.nan_to_num(price_change, nan=0.0, posinf=0.0, neginf=0.0)
                    else:
                        price_change = 0.0 if np.isnan(price_change) or np.isinf(price_change) else price_change
            else:
                price_change = 0.0

            # Check if this is a significant price movement
            if abs(price_change) > 0.002:  # Significant price change
                # Store in price movement memory
                self.price_movement_memory.append(experience)

                # Log significant price movements
                direction = "UP" if price_change > 0 else "DOWN"
                logger.info(f"Stored significant {direction} price movement: {price_change:.4f}")

                # For clear price movements, also duplicate in main memory to learn more
                if abs(price_change) > 0.005:  # Very significant movement
                    for _ in range(2):  # Add 2 extra copies
                        self.memory.append(experience)
        except Exception as e:
            # Skip price movement analysis if it fails
            pass

        # Check if this is an extrema point based on our extrema detection head
        if hasattr(self, 'last_extrema_pred') and self.last_extrema_pred['class'] != 2:
            # Class 0 = bottom, 1 = top, 2 = neither
            # Only consider high confidence predictions
            if self.last_extrema_pred['confidence'] > 0.7:
                self.extrema_memory.append(experience)

                # Log this special experience
                extrema_type = "BOTTOM" if self.last_extrema_pred['class'] == 0 else "TOP"
                logger.info(f"Stored {extrema_type} experience with reward {reward:.4f}")

                # For tops and bottoms, also duplicate the experience in memory to learn more from it
                for _ in range(2):  # Add 2 extra copies
                    self.memory.append(experience)
        # Explicitly marked extrema points also go to extrema memory
        elif is_extrema:
            self.extrema_memory.append(experience)

        # Store positive experiences separately for prioritized replay
        if reward > 0:
            self.positive_memory.append(experience)

            # For very good rewards, duplicate to learn more from them
            if reward > 0.1:
                for _ in range(min(int(reward * 10), 5)):  # Cap at 5 extra copies for very high rewards
                    self.positive_memory.append(experience)

        # Keep memory size under control
        if len(self.memory) > self.buffer_size:
            # Keep more recent experiences
            self.memory = self.memory[-self.buffer_size:]

        # Keep specialized memories under control too
        if len(self.positive_memory) > self.buffer_size // 4:
            self.positive_memory = self.positive_memory[-(self.buffer_size // 4):]
        if len(self.extrema_memory) > self.buffer_size // 4:
            self.extrema_memory = self.extrema_memory[-(self.buffer_size // 4):]
        if len(self.price_movement_memory) > self.buffer_size // 4:
            self.price_movement_memory = self.price_movement_memory[-(self.buffer_size // 4):]
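    # Routing summary for remember(): every experience goes to self.memory; experiences
    # with |price change| > 0.002 are also copied to price_movement_memory (and duplicated
    # in the main memory above 0.005); high-confidence extrema predictions or is_extrema=True
    # samples go to extrema_memory; positive rewards go to positive_memory (duplicated up to
    # 5 extra times for rewards above 0.1). All buffers are then trimmed to their caps.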
    def act(self, state: np.ndarray, explore=True, current_price=None, market_context=None) -> int:
        """
        Choose action based on current state using 2-action system with intelligent position management

        Args:
            state: Current market state
            explore: Whether to use epsilon-greedy exploration
            current_price: Current market price for position management
            market_context: Additional market context for decision making

        Returns:
            int: Action (0=SELL, 1=BUY) or None if should hold position
        """
        # Convert state to tensor
        if isinstance(state, np.ndarray):
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        else:
            state_tensor = state.unsqueeze(0).to(self.device)

        # Get Q-values (the Enhanced CNN also returns auxiliary heads, which are ignored here)
        with torch.no_grad():
            q_values, _, _, _, _ = self.policy_net(state_tensor)
        action_values = q_values.cpu().data.numpy()[0]

        # Calculate confidence scores
        sell_confidence = torch.softmax(q_values, dim=1)[0, 0].item()
        buy_confidence = torch.softmax(q_values, dim=1)[0, 1].item()

        # Determine action based on current position and confidence thresholds
        action = self._determine_action_with_position_management(
            sell_confidence, buy_confidence, current_price, market_context, explore
        )

        # Update tracking
        if current_price:
            self.recent_prices.append(current_price)

        if action is not None:
            self.recent_actions.append(action)
            return action
        else:
            # Return None to indicate HOLD (don't change position)
            return None
    def act_with_confidence(self, state: np.ndarray, market_regime: str = 'trending') -> Tuple[int, float]:
        """Choose action with confidence score adapted to market regime (from Enhanced DQN)"""
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            q_values, _, _, _, _ = self.policy_net(state_tensor)  # Auxiliary heads are not needed here

            # Convert Q-values to probabilities
            action_probs = torch.softmax(q_values, dim=1)
            action = q_values.argmax().item()
            base_confidence = action_probs[0, action].item()

            # Adapt confidence based on market regime
            regime_weight = self.market_regime_weights.get(market_regime, 1.0)
            adapted_confidence = min(base_confidence * regime_weight, 1.0)

            return action, adapted_confidence
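    # Worked example of the regime adaptation above: with a softmax confidence of 0.55,
    # the reported confidence becomes min(0.55 * 1.2, 1.0) = 0.66 in a 'trending' market,
    # 0.44 in a 'ranging' market, and 0.33 in a 'volatile' one.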
    def _determine_action_with_position_management(self, sell_conf, buy_conf, current_price, market_context, explore):
        """
        Determine action based on current position and confidence thresholds

        This implements the intelligent position management where:
        - When neutral: Need high confidence to enter position
        - When in position: Need lower confidence to exit
        - Different thresholds for entry vs exit
        """
        # Apply epsilon-greedy exploration
        if explore and np.random.random() <= self.epsilon:
            return np.random.choice([0, 1])

        # Get the dominant signal
        dominant_action = 0 if sell_conf > buy_conf else 1
        dominant_confidence = max(sell_conf, buy_conf)

        # Decision logic based on current position
        if self.current_position == 0:  # No position - need high confidence to enter
            if dominant_confidence >= self.entry_confidence_threshold:
                # Strong enough signal to enter position
                if dominant_action == 1:  # BUY signal
                    self.current_position = 1.0
                    self.position_entry_price = current_price
                    self.position_entry_time = time.time()
                    logger.info(f"ENTERING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
                    return 1
                else:  # SELL signal
                    self.current_position = -1.0
                    self.position_entry_price = current_price
                    self.position_entry_time = time.time()
                    logger.info(f"ENTERING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
                    return 0
            else:
                # Not confident enough to enter position
                return None

        elif self.current_position > 0:  # Long position
            # Check the stronger flip condition first so the lower exit threshold does not shadow it
            if dominant_action == 0 and dominant_confidence >= self.entry_confidence_threshold:
                # Very strong SELL signal - close long and enter short
                pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"FLIPPING from LONG to SHORT at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = -1.0
                self.position_entry_price = current_price
                self.position_entry_time = time.time()
                return 0
            elif dominant_action == 0 and dominant_confidence >= self.exit_confidence_threshold:
                # SELL signal with enough confidence to close long position
                pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"CLOSING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = 0.0
                self.position_entry_price = 0.0
                self.position_entry_time = None
                return 0
            else:
                # Hold the long position
                return None

        elif self.current_position < 0:  # Short position
            # Check the stronger flip condition first so the lower exit threshold does not shadow it
            if dominant_action == 1 and dominant_confidence >= self.entry_confidence_threshold:
                # Very strong BUY signal - close short and enter long
                pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"FLIPPING from SHORT to LONG at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = 1.0
                self.position_entry_price = current_price
                self.position_entry_time = time.time()
                return 1
            elif dominant_action == 1 and dominant_confidence >= self.exit_confidence_threshold:
                # BUY signal with enough confidence to close short position
                pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"CLOSING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = 0.0
                self.position_entry_price = 0.0
                self.position_entry_time = None
                return 1
            else:
                # Hold the short position
                return None

        return None
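    # Position state machine implemented above (thresholds are the __init__ defaults):
    #
    #   position  signal   confidence needed   result
    #   --------  -------  ------------------  --------------------------
    #   neutral   any      >= 0.7 (entry)      open long (BUY) / short (SELL)
    #   long      SELL     >= 0.7 (entry)      flip to short
    #   long      SELL     >= 0.3 (exit)       close long
    #   short     BUY      >= 0.7 (entry)      flip to long
    #   short     BUY      >= 0.3 (exit)       close short
    #   any       other    below threshold     hold (returns None)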
    def replay(self, experiences=None):
        """Train the model using experiences from memory"""
        # Don't train if not in training mode
        if not self.training:
            return 0.0

        # If no experiences provided, sample from memory
        if experiences is None:
            # Skip if memory is too small
            if len(self.memory) < self.batch_size:
                return 0.0

            # Sample random mini-batch from memory
            indices = np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False)
            experiences = [self.memory[i] for i in indices]

        # Choose appropriate replay method
        if self.use_mixed_precision:
            # Convert experiences to tensors for mixed precision
            states = torch.FloatTensor(np.array([e[0] for e in experiences])).to(self.device)
            actions = torch.LongTensor(np.array([e[1] for e in experiences])).to(self.device)
            rewards = torch.FloatTensor(np.array([e[2] for e in experiences])).to(self.device)
            next_states = torch.FloatTensor(np.array([e[3] for e in experiences])).to(self.device)
            dones = torch.FloatTensor(np.array([e[4] for e in experiences])).to(self.device)

            # Use mixed precision replay
            loss = self._replay_mixed_precision(states, actions, rewards, next_states, dones)
        else:
            # Pass experiences directly to standard replay method
            loss = self._replay_standard(experiences)

        # Store loss for monitoring
        self.losses.append(loss)

        # Track and decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        # Randomly decide if we should train on extrema points from special memory
        if random.random() < 0.3 and len(self.extrema_memory) >= self.batch_size:
            # Train specifically on extrema memory examples
            extrema_indices = np.random.choice(len(self.extrema_memory), size=min(self.batch_size, len(self.extrema_memory)), replace=False)
            extrema_batch = [self.extrema_memory[i] for i in extrema_indices]

            # Extract tensors from extrema batch
            extrema_states = torch.FloatTensor(np.array([e[0] for e in extrema_batch])).to(self.device)
            extrema_actions = torch.LongTensor(np.array([e[1] for e in extrema_batch])).to(self.device)
            extrema_rewards = torch.FloatTensor(np.array([e[2] for e in extrema_batch])).to(self.device)
            extrema_next_states = torch.FloatTensor(np.array([e[3] for e in extrema_batch])).to(self.device)
            extrema_dones = torch.FloatTensor(np.array([e[4] for e in extrema_batch])).to(self.device)

            # Use a slightly reduced learning rate for extrema training
            old_lr = self.optimizer.param_groups[0]['lr']
            self.optimizer.param_groups[0]['lr'] = old_lr * 0.8

            # Train on extrema memory
            if self.use_mixed_precision:
                extrema_loss = self._replay_mixed_precision(extrema_states, extrema_actions, extrema_rewards, extrema_next_states, extrema_dones)
            else:
                extrema_loss = self._replay_standard(extrema_batch)

            # Reset learning rate
            self.optimizer.param_groups[0]['lr'] = old_lr

            # Log extrema loss
            logger.info(f"Extra training on extrema points, loss: {extrema_loss:.4f}")

        # Randomly train on price movement examples (similar to extrema)
        if random.random() < 0.3 and len(self.price_movement_memory) >= self.batch_size:
            # Train specifically on price movement memory examples
            price_indices = np.random.choice(len(self.price_movement_memory), size=min(self.batch_size, len(self.price_movement_memory)), replace=False)
            price_batch = [self.price_movement_memory[i] for i in price_indices]

            # Extract tensors from price movement batch
            price_states = torch.FloatTensor(np.array([e[0] for e in price_batch])).to(self.device)
            price_actions = torch.LongTensor(np.array([e[1] for e in price_batch])).to(self.device)
            price_rewards = torch.FloatTensor(np.array([e[2] for e in price_batch])).to(self.device)
            price_next_states = torch.FloatTensor(np.array([e[3] for e in price_batch])).to(self.device)
            price_dones = torch.FloatTensor(np.array([e[4] for e in price_batch])).to(self.device)

            # Use a slightly reduced learning rate for price movement training
            old_lr = self.optimizer.param_groups[0]['lr']
            self.optimizer.param_groups[0]['lr'] = old_lr * 0.75

            # Train on price movement memory
            if self.use_mixed_precision:
                price_loss = self._replay_mixed_precision(price_states, price_actions, price_rewards, price_next_states, price_dones)
            else:
                price_loss = self._replay_standard(price_batch)

            # Reset learning rate
            self.optimizer.param_groups[0]['lr'] = old_lr

            # Log price movement loss
            logger.info(f"Extra training on price movement examples, loss: {price_loss:.4f}")

        return loss
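    # The replay paths below learn towards the standard Q-learning target
    #
    #     target = r + (1 - done) * gamma * max_a' Q_target(s', a')
    #
    # and _replay_standard additionally applies the Double DQN variant when
    # use_double_dqn is set, where the action is chosen by the policy network
    # and evaluated by the target network:
    #
    #     target = r + (1 - done) * gamma * Q_target(s', argmax_a' Q_policy(s', a'))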
    def _replay_standard(self, experiences=None):
        """Standard training step without mixed precision"""
        try:
            # Use experiences if provided, otherwise sample from memory
            if experiences is None:
                # If memory is too small, skip training
                if len(self.memory) < self.batch_size:
                    return 0.0

                # Sample random mini-batch from memory
                indices = np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False)
                batch = [self.memory[i] for i in indices]
                experiences = batch

            # Unpack experiences
            states, actions, rewards, next_states, dones = zip(*experiences)

            # Convert to PyTorch tensors
            states = torch.FloatTensor(np.array(states)).to(self.device)
            actions = torch.LongTensor(np.array(actions)).to(self.device)
            rewards = torch.FloatTensor(np.array(rewards)).to(self.device)
            next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
            dones = torch.FloatTensor(np.array(dones)).to(self.device)

            # Get current Q values
            current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self.policy_net(states)
            current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

            # Enhanced Double DQN implementation
            with torch.no_grad():
                if self.use_double_dqn:
                    # Double DQN: Use policy network to select actions, target network to evaluate
                    policy_q_values, _, _, _, _ = self.policy_net(next_states)
                    next_actions = policy_q_values.argmax(1)
                    target_q_values_all, _, _, _, _ = self.target_net(next_states)
                    next_q_values = target_q_values_all.gather(1, next_actions.unsqueeze(1)).squeeze(1)
                else:
                    # Standard DQN: Use target network for both selection and evaluation
                    next_q_values, next_extrema_pred, next_price_pred, next_hidden_features, next_advanced_pred = self.target_net(next_states)
                    next_q_values = next_q_values.max(1)[0]

            # Check for dimension mismatch between rewards and next_q_values
            if rewards.shape[0] != next_q_values.shape[0]:
                logger.warning(f"Shape mismatch detected in standard replay: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
                # Use the smaller size to prevent index error
                min_size = min(rewards.shape[0], next_q_values.shape[0])
                rewards = rewards[:min_size]
                dones = dones[:min_size]
                next_q_values = next_q_values[:min_size]
                current_q_values = current_q_values[:min_size]

            # Calculate target Q values
            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

            # Compute loss for Q value
            q_loss = self.criterion(current_q_values, target_q_values)

            # Try to compute extrema loss if possible
            try:
                # Get the target classes from extrema predictions
                extrema_targets = torch.argmax(current_extrema_pred, dim=1).long()

                # Compute extrema loss using cross-entropy - this is an auxiliary task
                extrema_loss = F.cross_entropy(current_extrema_pred, extrema_targets)

                # Combined loss with emphasis on Q-learning
                total_loss = q_loss + 0.1 * extrema_loss
            except Exception as e:
                logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
                total_loss = q_loss

            # Reset gradients
            self.optimizer.zero_grad()

            # Backward pass
            total_loss.backward()

            # Enhanced gradient clipping with configurable norm
            torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.gradient_clip_norm)

            # Update weights
            self.optimizer.step()

            # Enhanced target network update tracking
            self.training_steps += 1
            if self.training_steps % self.target_update_freq == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
                logger.debug(f"Target network updated at step {self.training_steps}")

            # Enhanced statistics tracking
            self.epsilon_history.append(self.epsilon)

            # Calculate and store TD error for analysis
            with torch.no_grad():
                td_error = torch.abs(current_q_values - target_q_values).mean().item()
                self.td_errors.append(td_error)

            # Return loss
            return total_loss.item()
        except Exception as e:
            logger.error(f"Error in replay standard: {str(e)}")
            import traceback
            logger.error(traceback.format_exc())
            return 0.0
    def _replay_mixed_precision(self, states, actions, rewards, next_states, dones):
        """Mixed precision training step for better GPU performance"""
        # Check if mixed precision should be explicitly disabled
        if 'DISABLE_MIXED_PRECISION' in os.environ:
            logger.info("Mixed precision explicitly disabled by environment variable")
            # _replay_standard consumes a list of experience tuples, so rebuild it from the tensors
            experiences = list(zip(states.cpu().numpy(), actions.cpu().numpy(),
                                   rewards.cpu().numpy(), next_states.cpu().numpy(),
                                   dones.cpu().numpy()))
            return self._replay_standard(experiences)
        try:
            # Zero gradients
            self.optimizer.zero_grad()

            # Forward pass with amp autocasting
            with torch.cuda.amp.autocast():
                # Get current Q values and extrema predictions
                current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self.policy_net(states)
                current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

                # Get next Q values from target network
                with torch.no_grad():
                    next_q_values, next_extrema_pred, next_price_pred, next_hidden_features, next_advanced_pred = self.target_net(next_states)
                    next_q_values = next_q_values.max(1)[0]

                # Check for dimension mismatch and fix it
                if rewards.shape[0] != next_q_values.shape[0]:
                    # Log the shape mismatch for debugging
                    logger.warning(f"Shape mismatch detected: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
                    # Use the smaller size to prevent index errors
                    min_size = min(rewards.shape[0], next_q_values.shape[0])
                    rewards = rewards[:min_size]
                    dones = dones[:min_size]
                    next_q_values = next_q_values[:min_size]
                    current_q_values = current_q_values[:min_size]

                target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

                # Compute Q-value loss (primary task)
                q_loss = nn.MSELoss()(current_q_values, target_q_values)

                # Initialize loss with q_loss
                loss = q_loss
                # Try to extract price from current and next states
                try:
                    # Extract price feature from sequence data (if available)
                    if len(states.shape) == 3:  # [batch, seq, features]
                        current_prices = states[:, -1, -1]  # Last timestep, last feature
                        next_prices = next_states[:, -1, -1]
                    else:  # [batch, features]
                        current_prices = states[:, -1]  # Last feature
                        next_prices = next_states[:, -1]

                    # Calculate price change for different timeframes
                    immediate_changes = (next_prices - current_prices) / current_prices

                    # Get the actual batch size for this calculation
                    actual_batch_size = states.shape[0]

                    # Create price direction labels - simplified for training
                    # 0 = down, 1 = sideways, 2 = up
                    immediate_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1  # Default: sideways
                    midterm_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1
                    longterm_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1

                    # Immediate term direction (1s, 1m)
                    immediate_up = (immediate_changes > 0.0005)
                    immediate_down = (immediate_changes < -0.0005)
                    immediate_labels[immediate_up] = 2  # Up
                    immediate_labels[immediate_down] = 0  # Down

                    # For mid and long term, we can only approximate during training
                    # In a real system, we'd need historical data to validate these
                    # Here we'll use the immediate term with increasing thresholds as approximation

                    # Mid-term (1h) - use slightly higher threshold
                    midterm_up = (immediate_changes > 0.001)
                    midterm_down = (immediate_changes < -0.001)
                    midterm_labels[midterm_up] = 2  # Up
                    midterm_labels[midterm_down] = 0  # Down

                    # Long-term (1d) - use even higher threshold
                    longterm_up = (immediate_changes > 0.002)
                    longterm_down = (immediate_changes < -0.002)
                    longterm_labels[longterm_up] = 2  # Up
                    longterm_labels[longterm_down] = 0  # Down

                    # Generate target values for price change regression
                    # For simplicity, we'll use the immediate change and scaled versions for longer timeframes
                    price_value_targets = torch.zeros((actual_batch_size, 4), device=self.device)
                    price_value_targets[:, 0] = immediate_changes
                    price_value_targets[:, 1] = immediate_changes * 2.0  # Approximate 1h change
                    price_value_targets[:, 2] = immediate_changes * 4.0  # Approximate 1d change
                    price_value_targets[:, 3] = immediate_changes * 6.0  # Approximate 1w change

                    # Calculate loss for price direction prediction (classification)
                    if len(current_price_pred['immediate'].shape) > 1 and current_price_pred['immediate'].shape[0] >= actual_batch_size:
                        # Slice predictions to match the adjusted batch size
                        immediate_pred = current_price_pred['immediate'][:actual_batch_size]
                        midterm_pred = current_price_pred['midterm'][:actual_batch_size]
                        longterm_pred = current_price_pred['longterm'][:actual_batch_size]
                        price_values_pred = current_price_pred['values'][:actual_batch_size]

                        # Compute losses for each task
                        immediate_loss = nn.CrossEntropyLoss()(immediate_pred, immediate_labels)
                        midterm_loss = nn.CrossEntropyLoss()(midterm_pred, midterm_labels)
                        longterm_loss = nn.CrossEntropyLoss()(longterm_pred, longterm_labels)

                        # MSE loss for price value regression
                        price_value_loss = nn.MSELoss()(price_values_pred, price_value_targets)

                        # Combine all price prediction losses
                        price_loss = immediate_loss + 0.7 * midterm_loss + 0.5 * longterm_loss + 0.3 * price_value_loss

                        # Create extrema labels (same as before)
                        extrema_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 2  # Default: neither

                        # Identify potential bottoms (significant negative change)
                        bottoms = (immediate_changes < -0.003)
                        extrema_labels[bottoms] = 0

                        # Identify potential tops (significant positive change)
                        tops = (immediate_changes > 0.003)
                        extrema_labels[tops] = 1

                        # Calculate extrema prediction loss
                        if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= actual_batch_size:
                            current_extrema_pred = current_extrema_pred[:actual_batch_size]
                            extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)

                            # Combined loss with all components
                            # Primary task: Q-value learning (RL objective)
                            # Secondary tasks: extrema detection and price prediction (supervised objectives)
                            loss = q_loss + 0.3 * extrema_loss + 0.3 * price_loss

                            # Log loss components occasionally
                            if random.random() < 0.01:  # Log 1% of the time
                                logger.info(
                                    f"Mixed precision losses: Q-loss={q_loss.item():.4f}, "
                                    f"Extrema-loss={extrema_loss.item():.4f}, "
                                    f"Price-loss={price_loss.item():.4f}"
                                )
                except Exception as e:
                    # Fallback if price extraction fails
                    logger.warning(f"Failed to calculate price prediction loss: {str(e)}. Using only Q-value loss.")
                    # Just use Q-value loss
                    loss = q_loss
            # Backward pass with scaled gradients
            self.scaler.scale(loss).backward()

            # Gradient clipping on scaled gradients
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)

            # Update with scaler
            self.scaler.step(self.optimizer)
            self.scaler.update()

            # Update target network if needed
            self.update_count += 1
            if self.update_count % self.target_update == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())

            # Track and decay epsilon
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

            return loss.item()
        except Exception as e:
            logger.error(f"Error in mixed precision training: {str(e)}")
            logger.warning("Falling back to standard precision training")
            # Fall back to standard training; _replay_standard consumes a list of experience tuples
            experiences = list(zip(states.cpu().numpy(), actions.cpu().numpy(),
                                   rewards.cpu().numpy(), next_states.cpu().numpy(),
                                   dones.cpu().numpy()))
            return self._replay_standard(experiences)
    def train_on_extrema(self, states, actions, rewards, next_states, dones):
        """
        Special training function specifically for extrema points

        Args:
            states: Batch of states at extrema points
            actions: Batch of actions
            rewards: Batch of rewards
            next_states: Batch of next states
            dones: Batch of done flags

        Returns:
            float: Training loss
        """
        # Convert to numpy arrays if not already
        if not isinstance(states, np.ndarray):
            states = np.array(states)
        if not isinstance(actions, np.ndarray):
            actions = np.array(actions)
        if not isinstance(rewards, np.ndarray):
            rewards = np.array(rewards)
        if not isinstance(next_states, np.ndarray):
            next_states = np.array(next_states)
        if not isinstance(dones, np.ndarray):
            dones = np.array(dones, dtype=np.float32)

        # Normalize states
        states = np.vstack([self._normalize_state(s) for s in states])
        next_states = np.vstack([self._normalize_state(s) for s in next_states])

        # Convert to torch tensors and move to device
        states_tensor = torch.FloatTensor(states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)

        # Choose training method based on precision mode
        if self.use_mixed_precision:
            return self._replay_mixed_precision(
                states_tensor, actions_tensor, rewards_tensor,
                next_states_tensor, dones_tensor
            )
        else:
            # _replay_standard consumes a list of experience tuples
            return self._replay_standard(
                list(zip(states, actions, rewards, next_states, dones))
            )
    def _normalize_state(self, state: np.ndarray) -> np.ndarray:
        """Normalize the state data to prevent numerical issues"""
        # Handle NaN and infinite values
        state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)

        # Check if state is 1D array (happens in some environments)
        if len(state.shape) == 1:
            # If 1D, we need to normalize the whole array
            normalized_state = state.copy()

            # Convert any timestamp or non-numeric data to float
            for i in range(len(normalized_state)):
                # Check for timestamp-like objects
                if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')):
                    # Convert timestamp to float (seconds since epoch)
                    normalized_state[i] = float(normalized_state[i].timestamp())
                elif not isinstance(normalized_state[i], (int, float, np.number)):
                    # Set non-numeric data to 0
                    normalized_state[i] = 0.0

            # Ensure all values are float
            normalized_state = normalized_state.astype(np.float32)

            # Simple min-max normalization for 1D state
            state_min = np.min(normalized_state)
            state_max = np.max(normalized_state)
            if state_max > state_min:
                normalized_state = (normalized_state - state_min) / (state_max - state_min)
            return normalized_state

        # Handle 2D arrays
        normalized_state = np.zeros_like(state, dtype=np.float32)

        # Convert any timestamp or non-numeric data to float
        for i in range(state.shape[0]):
            for j in range(state.shape[1]):
                if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')):
                    # Convert timestamp to float (seconds since epoch)
                    normalized_state[i, j] = float(state[i, j].timestamp())
                elif isinstance(state[i, j], (int, float, np.number)):
                    normalized_state[i, j] = state[i, j]
                else:
                    # Set non-numeric data to 0
                    normalized_state[i, j] = 0.0

        # Loop through each timeframe's features in the combined state
        feature_count = state.shape[1] // len(self.timeframes)

        for tf_idx in range(len(self.timeframes)):
            start_idx = tf_idx * feature_count
            end_idx = start_idx + feature_count

            # Extract this timeframe's features
            tf_features = normalized_state[:, start_idx:end_idx]

            # Normalize OHLC data by a reference close price (the mean close of the window)
            # This makes price movements relative rather than absolute
            price_idx = 3  # Assuming close price is at index 3
            if price_idx < tf_features.shape[1]:
                reference_price = np.mean(tf_features[:, price_idx])
                if reference_price != 0:
                    # Normalize price-related columns (OHLC)
                    for i in range(4):  # First 4 columns are OHLC
                        if i < tf_features.shape[1]:
                            normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price

            # Normalize volume using mean and std
            vol_idx = 4  # Assuming volume is at index 4
            if vol_idx < tf_features.shape[1]:
                vol_mean = np.mean(tf_features[:, vol_idx])
                vol_std = np.std(tf_features[:, vol_idx])
                if vol_std > 0:
                    normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std
                else:
                    normalized_state[:, start_idx + vol_idx] = 0

            # Other features (technical indicators) - normalize with min-max scaling
            for i in range(5, feature_count):
                if i < tf_features.shape[1]:
                    feature_min = np.min(tf_features[:, i])
                    feature_max = np.max(tf_features[:, i])
                    if feature_max > feature_min:
                        normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min)
                    else:
                        normalized_state[:, start_idx + i] = 0

        return normalized_state
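    # Example of the per-timeframe normalization above, assuming the usual
    # [open, high, low, close, volume, indicators...] column order: OHLC columns are
    # divided by the mean close of the window (so a close of 3010 with a mean close
    # of 3000 becomes ~1.0033), volume is z-scored with the window mean/std, and the
    # remaining indicator columns are min-max scaled into [0, 1].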
    def update_realtime_tick_features(self, tick_features):
        """Update with real-time tick features from tick processor"""
        try:
            if tick_features is not None:
                self.realtime_tick_features = tick_features

                # Log high-confidence tick features
                if tick_features.get('confidence', 0) > 0.8:
                    logger.debug(f"High-confidence tick features updated: confidence={tick_features['confidence']:.3f}")
        except Exception as e:
            logger.error(f"Error updating real-time tick features: {e}")
    def _enhance_state_with_tick_features(self, state: np.ndarray) -> np.ndarray:
        """Enhance state with real-time tick features if available"""
        try:
            if self.realtime_tick_features is None:
                return state

            # Extract neural features from tick processor
            neural_features = self.realtime_tick_features.get('neural_features', np.array([]))
            volume_features = self.realtime_tick_features.get('volume_features', np.array([]))
            microstructure_features = self.realtime_tick_features.get('microstructure_features', np.array([]))
            confidence = self.realtime_tick_features.get('confidence', 0.0)

            # Combine tick features - make them compact to match state dimensions
            tick_features = np.concatenate([
                neural_features[:3] if len(neural_features) >= 3 else np.zeros(3),  # Take first 3 neural features
                volume_features[:1] if len(volume_features) >= 1 else np.zeros(1),  # Take first volume feature
                microstructure_features[:1] if len(microstructure_features) >= 1 else np.zeros(1),  # Take first microstructure feature
            ])

            # Weight the tick features
            weighted_tick_features = tick_features * self.tick_feature_weight

            # Enhance the state by adding tick features to each timeframe
            if len(state.shape) == 1:
                # 1D state - append tick features
                enhanced_state = np.concatenate([state, weighted_tick_features])
            else:
                # 2D state - add tick features to each timeframe row
                num_timeframes, num_features = state.shape

                # Ensure tick features match the number of original features
                if len(weighted_tick_features) != num_features:
                    # Pad or truncate tick features to match state feature dimension
                    if len(weighted_tick_features) < num_features:
                        # Pad with zeros
                        padded_features = np.zeros(num_features)
                        padded_features[:len(weighted_tick_features)] = weighted_tick_features
                        weighted_tick_features = padded_features
                    else:
                        # Truncate to match
                        weighted_tick_features = weighted_tick_features[:num_features]

                # Add tick features to the last row (most recent timeframe)
                enhanced_state = state.copy()
                enhanced_state[-1, :] += weighted_tick_features  # Add to last timeframe

            return enhanced_state
        except Exception as e:
            logger.error(f"Error enhancing state with tick features: {e}")
            return state
    def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
        """Update learning metrics and perform learning rate adjustments if needed"""
        # Update average reward with exponential moving average
        if self.avg_reward == 0:
            self.avg_reward = episode_reward
        else:
            self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward

        # Check if we're making sufficient progress
        if episode_reward > (1 + best_reward_threshold) * self.best_reward:
            self.best_reward = episode_reward
            self.no_improvement_count = 0
            return True  # Improved
        else:
            self.no_improvement_count += 1

            # If no improvement for a while, adjust learning rate
            if self.no_improvement_count >= 10:
                current_lr = self.optimizer.param_groups[0]['lr']
                new_lr = current_lr * 0.5
                if new_lr >= 1e-6:  # Don't reduce below minimum threshold
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = new_lr
                    logger.info(f"Reducing learning rate from {current_lr} to {new_lr}")
                    self.no_improvement_count = 0
            return False  # No improvement
    def save(self, path: str):
        """Save model and agent state"""
        os.makedirs(os.path.dirname(path), exist_ok=True)

        # Save policy network
        self.policy_net.save(f"{path}_policy")

        # Save target network
        self.target_net.save(f"{path}_target")

        # Save agent state
        state = {
            'epsilon': self.epsilon,
            'update_count': self.update_count,
            'losses': self.losses,
            'optimizer_state': self.optimizer.state_dict(),
            'best_reward': self.best_reward,
            'avg_reward': self.avg_reward
        }
        torch.save(state, f"{path}_agent_state.pt")
        logger.info(f"Agent state saved to {path}_agent_state.pt")
    def load(self, path: str):
        """Load model and agent state"""
        # Load policy network
        self.policy_net.load(f"{path}_policy")

        # Load target network
        self.target_net.load(f"{path}_target")

        # Load agent state
        try:
            agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device)
            self.epsilon = agent_state['epsilon']
            self.update_count = agent_state['update_count']
            self.losses = agent_state['losses']
            self.optimizer.load_state_dict(agent_state['optimizer_state'])

            # Load additional metrics if they exist
            if 'best_reward' in agent_state:
                self.best_reward = agent_state['best_reward']
            if 'avg_reward' in agent_state:
                self.avg_reward = agent_state['avg_reward']

            logger.info(f"Agent state loaded from {path}_agent_state.pt")
        except FileNotFoundError:
            logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")
    def get_position_info(self):
        """Get current position information"""
        return {
            'position': self.current_position,
            'entry_price': self.position_entry_price,
            'entry_time': self.position_entry_time,
            'entry_threshold': self.entry_confidence_threshold,
            'exit_threshold': self.exit_confidence_threshold
        }
    def get_enhanced_training_stats(self):
        """Get enhanced RL training statistics with detailed metrics (from EnhancedDQNAgent)"""
        return {
            'buffer_size': len(self.memory),
            'epsilon': self.epsilon,
            'avg_reward': self.avg_reward,
            'best_reward': self.best_reward,
            'recent_rewards': list(self.recent_rewards) if hasattr(self, 'recent_rewards') else [],
            'no_improvement_count': self.no_improvement_count,
            # Enhanced statistics from EnhancedDQNAgent
            'training_steps': self.training_steps,
            'avg_td_error': np.mean(self.td_errors[-100:]) if self.td_errors else 0.0,
            'recent_losses': self.losses[-10:] if self.losses else [],
            'epsilon_trend': self.epsilon_history[-20:] if self.epsilon_history else [],
            'specialized_buffers': {
                'extrema_memory': len(self.extrema_memory),
                'positive_memory': len(self.positive_memory),
                'price_movement_memory': len(self.price_movement_memory)
            },
            'market_regime_weights': self.market_regime_weights,
            'use_double_dqn': self.use_double_dqn,
            'use_prioritized_replay': self.use_prioritized_replay,
            'gradient_clip_norm': self.gradient_clip_norm,
            'target_update_frequency': self.target_update_freq
        }
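    # Sketch of how the pieces above fit together in a training loop (illustrative only;
    # the `env` object and its API are placeholders, not part of this module):
    #
    #     state = env.reset()
    #     episode_reward = 0.0
    #     for step in range(max_steps):
    #         action = agent.act(state, explore=True, current_price=env.price)
    #         next_state, reward, done = env.step(action)
    #         episode_reward += reward
    #         if action is not None:
    #             agent.remember(state, action, reward, next_state, done)
    #         agent.replay()
    #         state = next_state
    #         if done:
    #             agent.update_learning_metrics(episode_reward)
    #             break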