fix model mappings, dash updates, trading

Dobromir Popov
2025-07-22 15:44:59 +03:00
parent 3e35b9cddb
commit 1a54fb1d56
32 changed files with 6168 additions and 857 deletions


@@ -578,7 +578,7 @@ class DQNAgent:
market_context: Additional market context for decision making
Returns:
int: Action (0=SELL, 1=BUY) or None if should hold position
int: Action (0=BUY, 1=SELL, 2=HOLD) or None if should hold position
"""
# Convert state to tensor
@@ -602,8 +602,9 @@ class DQNAgent:
if q_values.dim() == 1:
q_values = q_values.unsqueeze(0)
sell_confidence = torch.softmax(q_values, dim=1)[0, 0].item()
buy_confidence = torch.softmax(q_values, dim=1)[0, 1].item()
# FIXED ACTION MAPPING: 0=BUY, 1=SELL, 2=HOLD
buy_confidence = torch.softmax(q_values, dim=1)[0, 0].item()
sell_confidence = torch.softmax(q_values, dim=1)[0, 1].item()
# Determine action based on current position and confidence thresholds
action = self._determine_action_with_position_management(
@@ -669,68 +670,68 @@ class DQNAgent:
if explore and np.random.random() <= self.epsilon:
return np.random.choice([0, 1])
# Get the dominant signal
dominant_action = 0 if sell_conf > buy_conf else 1
dominant_confidence = max(sell_conf, buy_conf)
# Get the dominant signal - FIXED ACTION MAPPING: 0=BUY, 1=SELL
dominant_action = 0 if buy_conf > sell_conf else 1
dominant_confidence = max(buy_conf, sell_conf)
# Decision logic based on current position
if self.current_position == 0: # No position - need high confidence to enter
if dominant_confidence >= self.entry_confidence_threshold:
# Strong enough signal to enter position
if dominant_action == 1: # BUY signal
if dominant_action == 0: # BUY signal (action 0)
self.current_position = 1.0
self.position_entry_price = current_price
self.position_entry_time = time.time()
logger.info(f"ENTERING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
return 1
else: # SELL signal
return 0 # Return BUY action (0)
else: # SELL signal (action 1)
self.current_position = -1.0
self.position_entry_price = current_price
self.position_entry_time = time.time()
logger.info(f"ENTERING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
return 0
return 1 # Return SELL action (1)
else:
# Not confident enough to enter position
return None
elif self.current_position > 0: # Long position
if dominant_action == 0 and dominant_confidence >= self.exit_confidence_threshold:
# SELL signal with enough confidence to close long position
if dominant_action == 1 and dominant_confidence >= self.exit_confidence_threshold:
# SELL signal (action 1) with enough confidence to close long position
pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
logger.info(f"CLOSING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
self.current_position = 0.0
self.position_entry_price = 0.0
self.position_entry_time = None
return 0
elif dominant_action == 0 and dominant_confidence >= self.entry_confidence_threshold:
return 1 # Return SELL action (1)
elif dominant_action == 1 and dominant_confidence >= self.entry_confidence_threshold:
# Very strong SELL signal - close long and enter short
pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
logger.info(f"FLIPPING from LONG to SHORT at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
self.current_position = -1.0
self.position_entry_price = current_price
self.position_entry_time = time.time()
return 0
return 1 # Return SELL action (1)
else:
# Hold the long position
return None
elif self.current_position < 0: # Short position
if dominant_action == 1 and dominant_confidence >= self.exit_confidence_threshold:
# BUY signal with enough confidence to close short position
if dominant_action == 0 and dominant_confidence >= self.exit_confidence_threshold:
# BUY signal (action 0) with enough confidence to close short position
pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0
logger.info(f"CLOSING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
self.current_position = 0.0
self.position_entry_price = 0.0
self.position_entry_time = None
return 1
elif dominant_action == 1 and dominant_confidence >= self.entry_confidence_threshold:
return 0 # Return BUY action (0)
elif dominant_action == 0 and dominant_confidence >= self.entry_confidence_threshold:
# Very strong BUY signal - close short and enter long
pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0
logger.info(f"FLIPPING from SHORT to LONG at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
self.current_position = 1.0
self.position_entry_price = current_price
self.position_entry_time = time.time()
return 1
return 0 # Return BUY action (0)
else:
# Hold the short position
return None
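
For reference, a minimal sketch (not the repository's code) of the position-management rule the hunk above implements under the corrected mapping, where 0 = BUY, 1 = SELL and None means hold; the function name, threshold names and default values are illustrative assumptions.

def decide_action(position, buy_conf, sell_conf,
                  entry_threshold=0.6, exit_threshold=0.4):
    """Return 0 (BUY), 1 (SELL) or None (hold) given the current position."""
    action = 0 if buy_conf > sell_conf else 1      # dominant signal: 0=BUY, 1=SELL
    confidence = max(buy_conf, sell_conf)
    if position == 0:                              # flat: enter only on a strong signal
        return action if confidence >= entry_threshold else None
    if position > 0:                               # long: a confident SELL closes or flips it
        return 1 if action == 1 and confidence >= exit_threshold else None
    # short: a confident BUY closes or flips it
    return 0 if action == 0 and confidence >= exit_threshold else None
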
@@ -792,246 +793,157 @@ class DQNAgent:
indices = np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False)
experiences = [self.memory[i] for i in indices]
# Sanitize and stack states and next_states
sanitized_states = []
sanitized_next_states = []
sanitized_experiences = []
# Validate experiences before processing
if not experiences or len(experiences) == 0:
logger.warning("No experiences provided for training")
return 0.0
for i, e in enumerate(experiences):
try:
# Extract experience components
state, action, reward, next_state, done = e
# Sanitize state - convert any dict/object to float arrays
state = self._sanitize_state_data(state)
next_state = self._sanitize_state_data(next_state)
# Sanitize action - ensure it's an integer
if isinstance(action, dict):
# If action is a dict, try to extract action value
action = action.get('action', action.get('value', 0))
action = int(action) if not isinstance(action, (int, np.integer)) else action
# Sanitize reward - ensure it's a float
if isinstance(reward, dict):
# If reward is a dict, try to extract reward value
reward = reward.get('reward', reward.get('value', 0.0))
reward = float(reward) if not isinstance(reward, (float, np.floating)) else reward
# Sanitize done - ensure it's a boolean/float
if isinstance(done, dict):
done = done.get('done', done.get('value', False))
done = bool(done) if not isinstance(done, (bool, np.bool_)) else done
# Convert state to proper numpy array
state = np.asarray(state, dtype=np.float32)
next_state = np.asarray(next_state, dtype=np.float32)
# Add to sanitized lists
sanitized_states.append(state)
sanitized_next_states.append(next_state)
sanitized_experiences.append((state, action, reward, next_state, done))
except Exception as ex:
print(f"[DQNAgent] Bad experience at index {i}: {ex}")
continue
if not sanitized_states or not sanitized_next_states:
print("[DQNAgent] No valid states in replay batch.")
return 0.0 # Return float instead of None for consistency
# Validate all states have the same dimensions before stacking
expected_dim = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_dim, tuple):
expected_dim = np.prod(expected_dim)
# Debug: Check what dimensions we're actually seeing
if sanitized_states:
actual_dims = [len(state) for state in sanitized_states[:5]] # Check first 5
logger.debug(f"DQN State dimensions - Expected: {expected_dim}, Actual samples: {actual_dims}")
# If all states have a consistent dimension different from expected, use that
unique_dims = list(set(len(state) for state in sanitized_states))
if len(unique_dims) == 1 and unique_dims[0] != expected_dim:
logger.warning(f"All states have dimension {unique_dims[0]} but expected {expected_dim}. Using actual dimension.")
expected_dim = unique_dims[0]
# Filter out states with wrong dimensions and fix them
valid_states = []
valid_next_states = []
# Sanitize and validate experiences
valid_experiences = []
for i, exp in enumerate(experiences):
try:
if len(exp) != 5:
logger.debug(f"Invalid experience format at index {i}: expected 5 elements, got {len(exp)}")
continue
state, action, reward, next_state, done = exp
# Validate state
state = self._validate_and_fix_state(state)
next_state = self._validate_and_fix_state(next_state)
if state is None or next_state is None:
continue
# Validate action
if isinstance(action, dict):
action = action.get('action', action.get('value', 0))
action = int(action) if action is not None else 0
action = max(0, min(action, self.n_actions - 1)) # Clamp to valid range
# Validate reward
if isinstance(reward, dict):
reward = reward.get('reward', reward.get('value', 0.0))
reward = float(reward) if reward is not None else 0.0
# Validate done flag
done = bool(done) if done is not None else False
valid_experiences.append((state, action, reward, next_state, done))
except Exception as e:
logger.debug(f"Error processing experience {i}: {e}")
continue
for i, (state, next_state, exp) in enumerate(zip(sanitized_states, sanitized_next_states, sanitized_experiences)):
# Ensure states have correct dimensions
if len(state) != expected_dim:
logger.debug(f"Fixing state dimension: {len(state)} -> {expected_dim}")
if len(state) < expected_dim:
# Pad with zeros
padded_state = np.zeros(expected_dim, dtype=np.float32)
padded_state[:len(state)] = state
state = padded_state
else:
# Truncate
state = state[:expected_dim]
if len(next_state) != expected_dim:
logger.debug(f"Fixing next_state dimension: {len(next_state)} -> {expected_dim}")
if len(next_state) < expected_dim:
# Pad with zeros
padded_next_state = np.zeros(expected_dim, dtype=np.float32)
padded_next_state[:len(next_state)] = next_state
next_state = padded_next_state
else:
# Truncate
next_state = next_state[:expected_dim]
valid_states.append(state)
valid_next_states.append(next_state)
valid_experiences.append(exp)
if not valid_states:
print("[DQNAgent] No valid states after dimension fixing.")
if len(valid_experiences) == 0:
logger.warning("No valid experiences after sanitization")
return 0.0
# Use validated experiences for training
experiences = valid_experiences
states = torch.FloatTensor(np.stack(valid_states)).to(self.device)
next_states = torch.FloatTensor(np.stack(valid_next_states)).to(self.device)
# Extract components
states, actions, rewards, next_states, dones = zip(*experiences)
# Choose appropriate replay method
if self.use_mixed_precision:
# Convert experiences to tensors for mixed precision
actions = torch.LongTensor(np.array([e[1] for e in experiences])).to(self.device)
rewards = torch.FloatTensor(np.array([e[2] for e in experiences])).to(self.device)
dones = torch.FloatTensor(np.array([e[4] for e in experiences])).to(self.device)
# Convert to tensors with proper validation
try:
states = torch.FloatTensor(np.array(states)).to(self.device)
actions = torch.LongTensor(np.array(actions)).to(self.device)
rewards = torch.FloatTensor(np.array(rewards)).to(self.device)
next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
dones = torch.FloatTensor(np.array(dones)).to(self.device)
# Use mixed precision replay
# Final validation of tensor shapes
if states.shape[0] == 0 or actions.shape[0] == 0:
logger.warning("Empty tensors after conversion")
return 0.0
# Ensure all tensors have the same batch size
batch_size = states.shape[0]
if not all(tensor.shape[0] == batch_size for tensor in [actions, rewards, next_states, dones]):
logger.warning("Inconsistent batch sizes across tensors")
return 0.0
except Exception as e:
logger.error(f"Error converting experiences to tensors: {e}")
return 0.0
# Choose training method based on precision mode
if self.use_mixed_precision:
loss = self._replay_mixed_precision(states, actions, rewards, next_states, dones)
else:
# Pass experiences directly to standard replay method
loss = self._replay_standard(experiences)
# Store loss for monitoring
loss = self._replay_standard(states, actions, rewards, next_states, dones)
# Update epsilon
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
# Update statistics
self.losses.append(loss)
# Track and decay epsilon
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
# Randomly decide if we should train on extrema points from special memory
if random.random() < 0.3 and len(self.extrema_memory) >= self.batch_size:
# Train specifically on extrema memory examples
extrema_indices = np.random.choice(len(self.extrema_memory), size=min(self.batch_size, len(self.extrema_memory)), replace=False)
extrema_batch = [self.extrema_memory[i] for i in extrema_indices]
# Sanitize extrema batch
sanitized_extrema = []
for e in extrema_batch:
try:
state, action, reward, next_state, done = e
state = self._sanitize_state_data(state)
next_state = self._sanitize_state_data(next_state)
state = np.asarray(state, dtype=np.float32)
next_state = np.asarray(next_state, dtype=np.float32)
sanitized_extrema.append((state, action, reward, next_state, done))
except:
continue
if sanitized_extrema:
# Extract tensors from extrema batch
extrema_states = torch.FloatTensor(np.array([e[0] for e in sanitized_extrema])).to(self.device)
extrema_actions = torch.LongTensor(np.array([e[1] for e in sanitized_extrema])).to(self.device)
extrema_rewards = torch.FloatTensor(np.array([e[2] for e in sanitized_extrema])).to(self.device)
extrema_next_states = torch.FloatTensor(np.array([e[3] for e in sanitized_extrema])).to(self.device)
extrema_dones = torch.FloatTensor(np.array([e[4] for e in sanitized_extrema])).to(self.device)
# Use a slightly reduced learning rate for extrema training
old_lr = self.optimizer.param_groups[0]['lr']
self.optimizer.param_groups[0]['lr'] = old_lr * 0.8
# Train on extrema memory
if self.use_mixed_precision:
extrema_loss = self._replay_mixed_precision(extrema_states, extrema_actions, extrema_rewards, extrema_next_states, extrema_dones)
else:
extrema_loss = self._replay_standard(sanitized_extrema)
# Reset learning rate
self.optimizer.param_groups[0]['lr'] = old_lr
# Log extrema loss
logger.info(f"Extra training on extrema points, loss: {extrema_loss:.4f}")
# Randomly train on price movement examples (similar to extrema)
if random.random() < 0.3 and len(self.price_movement_memory) >= self.batch_size:
# Train specifically on price movement memory examples
price_indices = np.random.choice(len(self.price_movement_memory), size=min(self.batch_size, len(self.price_movement_memory)), replace=False)
price_batch = [self.price_movement_memory[i] for i in price_indices]
# Sanitize price movement batch
sanitized_price = []
for e in price_batch:
try:
state, action, reward, next_state, done = e
state = self._sanitize_state_data(state)
next_state = self._sanitize_state_data(next_state)
state = np.asarray(state, dtype=np.float32)
next_state = np.asarray(next_state, dtype=np.float32)
sanitized_price.append((state, action, reward, next_state, done))
except:
continue
if sanitized_price:
# Extract tensors from price movement batch
price_states = torch.FloatTensor(np.array([e[0] for e in sanitized_price])).to(self.device)
price_actions = torch.LongTensor(np.array([e[1] for e in sanitized_price])).to(self.device)
price_rewards = torch.FloatTensor(np.array([e[2] for e in sanitized_price])).to(self.device)
price_next_states = torch.FloatTensor(np.array([e[3] for e in sanitized_price])).to(self.device)
price_dones = torch.FloatTensor(np.array([e[4] for e in sanitized_price])).to(self.device)
# Use a slightly reduced learning rate for price movement training
old_lr = self.optimizer.param_groups[0]['lr']
self.optimizer.param_groups[0]['lr'] = old_lr * 0.75
# Train on price movement memory
if self.use_mixed_precision:
price_loss = self._replay_mixed_precision(price_states, price_actions, price_rewards, price_next_states, price_dones)
else:
price_loss = self._replay_standard(sanitized_price)
# Reset learning rate
self.optimizer.param_groups[0]['lr'] = old_lr
# Log price movement loss
logger.info(f"Extra training on price movement examples, loss: {price_loss:.4f}")
if len(self.losses) > 1000:
self.losses = self.losses[-500:] # Keep only recent losses
return loss
def _replay_standard(self, *args):
def _validate_and_fix_state(self, state):
"""Validate and fix state to ensure it has correct dimensions and no empty data"""
try:
# Convert to numpy if needed
if isinstance(state, torch.Tensor):
state = state.detach().cpu().numpy()
elif not isinstance(state, np.ndarray):
state = np.array(state, dtype=np.float32)
# Flatten if multi-dimensional
if state.ndim > 1:
state = state.flatten()
# Check for empty or invalid state
if state.size == 0:
logger.warning("Empty state detected, using default")
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
# Check for NaN or infinite values
if np.any(np.isnan(state)) or np.any(np.isinf(state)):
logger.warning("NaN or infinite values in state, replacing with zeros")
state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)
# Ensure correct dimensions
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
expected_size = int(expected_size)
if len(state) != expected_size:
if len(state) < expected_size:
# Pad with zeros
padded_state = np.zeros(expected_size, dtype=np.float32)
padded_state[:len(state)] = state
state = padded_state
else:
# Truncate
state = state[:expected_size]
return state.astype(np.float32)
except Exception as e:
logger.error(f"Error validating state: {e}")
# Return default state as fallback
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
def _replay_standard(self, states, actions, rewards, next_states, dones):
"""Standard training step without mixed precision"""
try:
# Support both (experiences,) and (states, actions, rewards, next_states, dones)
if len(args) == 1:
experiences = args[0]
# Use experiences if provided, otherwise sample from memory
if experiences is None:
# If memory is too small, skip training
if len(self.memory) < self.batch_size:
return 0.0
# Sample random mini-batch from memory
indices = np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False)
batch = [self.memory[i] for i in indices]
experiences = batch
# Unpack experiences
states, actions, rewards, next_states, dones = zip(*experiences)
states = torch.FloatTensor(np.array(states)).to(self.device)
actions = torch.LongTensor(np.array(actions)).to(self.device)
rewards = torch.FloatTensor(np.array(rewards)).to(self.device)
next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
dones = torch.FloatTensor(np.array(dones)).to(self.device)
elif len(args) == 5:
states, actions, rewards, next_states, dones = args
else:
raise ValueError("Invalid arguments to _replay_standard")
# Validate input tensors
if states.shape[0] == 0:
logger.warning("Empty batch in _replay_standard")
return 0.0
# Get current Q values using safe wrapper
current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self._safe_cnn_forward(self.policy_net, states)
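
As a side note, a compact sketch of the batch-preparation pattern the rewritten replay() follows: keep only well-formed (state, action, reward, next_state, done) tuples, then stack them into equally sized tensors before training. The function and parameter names here are assumptions for illustration, not the agent's actual helpers.

import numpy as np
import torch

def experiences_to_tensors(experiences, state_size, n_actions, device):
    """Sanitize a list of 5-tuples and stack them into training tensors."""
    valid = []
    for exp in experiences:
        if not isinstance(exp, (tuple, list)) or len(exp) != 5:
            continue                                     # drop malformed entries
        s, a, r, s2, done = exp
        s = np.asarray(s, dtype=np.float32).flatten()
        s2 = np.asarray(s2, dtype=np.float32).flatten()
        if s.size != state_size or s2.size != state_size:
            continue                                     # or pad/truncate as the agent does
        a = int(np.clip(int(a), 0, n_actions - 1))       # clamp to the valid action range
        valid.append((s, a, float(r), s2, float(bool(done))))
    if not valid:
        return None
    states, actions, rewards, next_states, dones = zip(*valid)
    return (torch.as_tensor(np.stack(states), device=device),
            torch.as_tensor(actions, dtype=torch.long, device=device),
            torch.as_tensor(rewards, dtype=torch.float32, device=device),
            torch.as_tensor(np.stack(next_states), device=device),
            torch.as_tensor(dones, dtype=torch.float32, device=device))
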
@@ -1047,14 +959,14 @@ class DQNAgent:
next_q_values = target_q_values_all.gather(1, next_actions.unsqueeze(1)).squeeze(1)
else:
# Standard DQN: Use target network for both selection and evaluation
next_q_values, next_extrema_pred, next_price_pred, next_hidden_features, next_advanced_pred = self.target_net(next_states)
next_q_values, _, _, _, _ = self._safe_cnn_forward(self.target_net, next_states)
next_q_values = next_q_values.max(1)[0]
# Check for dimension mismatch between rewards and next_q_values
if rewards.shape[0] != next_q_values.shape[0]:
logger.warning(f"Shape mismatch detected in standard replay: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
# Use the smaller size to prevent index error
min_size = min(rewards.shape[0], next_q_values.shape[0])
# Ensure tensor shapes are consistent
batch_size = states.shape[0]
if rewards.shape[0] != batch_size or next_q_values.shape[0] != batch_size:
logger.warning(f"Shape mismatch in replay: batch_size={batch_size}, rewards={rewards.shape}, next_q_values={next_q_values.shape}")
min_size = min(batch_size, rewards.shape[0], next_q_values.shape[0])
rewards = rewards[:min_size]
dones = dones[:min_size]
next_q_values = next_q_values[:min_size]
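
A minimal sketch of the Double DQN target computed above, assuming networks that return a single Q-value tensor (the networks in this file actually return multi-head tuples): the policy net picks the next action, the target net evaluates it, and terminal transitions mask out the bootstrap term.

import torch

@torch.no_grad()
def compute_targets(policy_net, target_net, rewards, next_states, dones,
                    gamma, double_dqn=True):
    if double_dqn:
        next_actions = policy_net(next_states).argmax(dim=1, keepdim=True)
        next_q = target_net(next_states).gather(1, next_actions).squeeze(1)
    else:
        next_q = target_net(next_states).max(dim=1)[0]
    return rewards + (1.0 - dones) * gamma * next_q   # Bellman target, zeroed at episode end
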
@@ -1063,70 +975,82 @@ class DQNAgent:
# Calculate target Q values
target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
# Compute loss for Q value
q_loss = self.criterion(current_q_values, target_q_values)
# Compute loss for Q value - ensure tensors require gradients
if not current_q_values.requires_grad:
logger.warning("Current Q values do not require gradients")
return 0.0
q_loss = self.criterion(current_q_values, target_q_values.detach())
# Try to compute extrema loss if possible
# Initialize total loss with Q loss
total_loss = q_loss
# Add auxiliary losses if available and valid
try:
# Get the target classes from extrema predictions
extrema_targets = torch.argmax(current_extrema_pred, dim=1).long()
# Compute extrema loss using cross-entropy - this is an auxiliary task
extrema_loss = F.cross_entropy(current_extrema_pred, extrema_targets)
# Combined loss with emphasis on Q-learning
total_loss = q_loss + 0.1 * extrema_loss
if current_extrema_pred is not None and current_extrema_pred.shape[0] > 0:
# Create simple extrema targets based on Q-values
with torch.no_grad():
extrema_targets = torch.ones(current_extrema_pred.shape[0], dtype=torch.long, device=current_extrema_pred.device) * 2 # Default to "neither"
extrema_loss = F.cross_entropy(current_extrema_pred, extrema_targets)
total_loss = total_loss + 0.1 * extrema_loss
except Exception as e:
logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
total_loss = q_loss
logger.debug(f"Could not calculate auxiliary loss: {e}")
# Reset gradients
self.optimizer.zero_grad()
# Ensure loss requires gradients before backward pass
# Ensure total loss requires gradients
if not total_loss.requires_grad:
logger.warning("Total loss tensor does not require gradients, skipping backward pass")
logger.warning("Total loss does not require gradients - policy network may not be in training mode")
self.policy_net.train() # Ensure training mode
return 0.0
# Backward pass
total_loss.backward()
# Enhanced gradient clipping with configurable norm
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.gradient_clip_norm)
# Gradient clipping
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
# Check if gradients are valid
has_valid_gradients = False
for param in self.policy_net.parameters():
if param.grad is not None and torch.any(torch.isfinite(param.grad)):
has_valid_gradients = True
break
if not has_valid_gradients:
logger.warning("No valid gradients found, skipping optimizer step")
return 0.0
# Update weights
self.optimizer.step()
# Enhanced target network update tracking
# Update target network periodically
self.training_steps += 1
if self.training_steps % self.target_update_freq == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
logger.debug(f"Target network updated at step {self.training_steps}")
# Enhanced statistics tracking
self.epsilon_history.append(self.epsilon)
# Calculate and store TD error for analysis
with torch.no_grad():
td_error = torch.abs(current_q_values - target_q_values).mean().item()
self.td_errors.append(td_error)
# Return loss
return total_loss.item()
except Exception as e:
logger.error(f"Error in replay standard: {str(e)}")
import traceback
logger.error(traceback.format_exc())
logger.error(f"Error in standard replay: {e}")
return 0.0
def _replay_mixed_precision(self, states, actions, rewards, next_states, dones):
"""Mixed precision training step for better GPU performance"""
# Check if mixed precision should be explicitly disabled
if 'DISABLE_MIXED_PRECISION' in os.environ:
logger.info("Mixed precision explicitly disabled by environment variable")
"""Mixed precision training step"""
if not self.use_mixed_precision:
logger.warning("Mixed precision not available, falling back to standard replay")
return self._replay_standard(states, actions, rewards, next_states, dones)
try:
# Validate input tensors
if states.shape[0] == 0:
logger.warning("Empty batch in _replay_mixed_precision")
return 0.0
# Zero gradients
self.optimizer.zero_grad()
@@ -1135,21 +1059,28 @@ class DQNAgent:
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
with torch.cuda.amp.autocast():
# Get current Q values and extrema predictions
current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self.policy_net(states)
# Get current Q values and predictions
current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self._safe_cnn_forward(self.policy_net, states)
current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
# Get next Q values from target network
with torch.no_grad():
next_q_values, next_extrema_pred, next_price_pred, next_hidden_features, next_advanced_pred = self.target_net(next_states)
next_q_values = next_q_values.max(1)[0]
if self.use_double_dqn:
# Double DQN
policy_q_values, _, _, _, _ = self._safe_cnn_forward(self.policy_net, next_states)
next_actions = policy_q_values.argmax(1)
target_q_values_all, _, _, _, _ = self._safe_cnn_forward(self.target_net, next_states)
next_q_values = target_q_values_all.gather(1, next_actions.unsqueeze(1)).squeeze(1)
else:
# Standard DQN
next_q_values, _, _, _, _ = self._safe_cnn_forward(self.target_net, next_states)
next_q_values = next_q_values.max(1)[0]
# Check for dimension mismatch and fix it
if rewards.shape[0] != next_q_values.shape[0]:
# Log the shape mismatch for debugging
logger.warning(f"Shape mismatch detected: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
# Use the smaller size to prevent index errors
min_size = min(rewards.shape[0], next_q_values.shape[0])
# Ensure consistent shapes
batch_size = states.shape[0]
if rewards.shape[0] != batch_size or next_q_values.shape[0] != batch_size:
logger.warning(f"Shape mismatch in mixed precision replay")
min_size = min(batch_size, rewards.shape[0], next_q_values.shape[0])
rewards = rewards[:min_size]
dones = dones[:min_size]
next_q_values = next_q_values[:min_size]
@@ -1158,147 +1089,63 @@ class DQNAgent:
target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
# Compute Q-value loss (primary task)
q_loss = nn.MSELoss()(current_q_values, target_q_values)
q_loss = nn.MSELoss()(current_q_values, target_q_values.detach())
# Initialize loss with q_loss
loss = q_loss
# Try to extract price from current and next states
# Add auxiliary losses if available
try:
# Extract price feature from sequence data (if available)
if len(states.shape) == 3: # [batch, seq, features]
current_prices = states[:, -1, -1] # Last timestep, last feature
next_prices = next_states[:, -1, -1]
else: # [batch, features]
current_prices = states[:, -1] # Last feature
next_prices = next_states[:, -1]
# Calculate price change for different timeframes
immediate_changes = (next_prices - current_prices) / current_prices
# Get the actual batch size for this calculation
actual_batch_size = states.shape[0]
# Create price direction labels - simplified for training
# 0 = down, 1 = sideways, 2 = up
immediate_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1 # Default: sideways
midterm_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1
longterm_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1
# Immediate term direction (1s, 1m)
immediate_up = (immediate_changes > 0.0005)
immediate_down = (immediate_changes < -0.0005)
immediate_labels[immediate_up] = 2 # Up
immediate_labels[immediate_down] = 0 # Down
# For mid and long term, we can only approximate during training
# In a real system, we'd need historical data to validate these
# Here we'll use the immediate term with increasing thresholds as approximation
# Mid-term (1h) - use slightly higher threshold
midterm_up = (immediate_changes > 0.001)
midterm_down = (immediate_changes < -0.001)
midterm_labels[midterm_up] = 2 # Up
midterm_labels[midterm_down] = 0 # Down
# Long-term (1d) - use even higher threshold
longterm_up = (immediate_changes > 0.002)
longterm_down = (immediate_changes < -0.002)
longterm_labels[longterm_up] = 2 # Up
longterm_labels[longterm_down] = 0 # Down
# Generate target values for price change regression
# For simplicity, we'll use the immediate change and scaled versions for longer timeframes
price_value_targets = torch.zeros((actual_batch_size, 4), device=self.device)
price_value_targets[:, 0] = immediate_changes
price_value_targets[:, 1] = immediate_changes * 2.0 # Approximate 1h change
price_value_targets[:, 2] = immediate_changes * 4.0 # Approximate 1d change
price_value_targets[:, 3] = immediate_changes * 6.0 # Approximate 1w change
# Calculate loss for price direction prediction (classification)
if len(current_price_pred['immediate'].shape) > 1 and current_price_pred['immediate'].shape[0] >= actual_batch_size:
# Slice predictions to match the adjusted batch size
immediate_pred = current_price_pred['immediate'][:actual_batch_size]
midterm_pred = current_price_pred['midterm'][:actual_batch_size]
longterm_pred = current_price_pred['longterm'][:actual_batch_size]
price_values_pred = current_price_pred['values'][:actual_batch_size]
if current_extrema_pred is not None and current_extrema_pred.shape[0] > 0:
# Simple extrema targets
with torch.no_grad():
extrema_targets = torch.ones(current_extrema_pred.shape[0], dtype=torch.long, device=current_extrema_pred.device) * 2
# Compute losses for each task
immediate_loss = nn.CrossEntropyLoss()(immediate_pred, immediate_labels)
midterm_loss = nn.CrossEntropyLoss()(midterm_pred, midterm_labels)
longterm_loss = nn.CrossEntropyLoss()(longterm_pred, longterm_labels)
extrema_loss = F.cross_entropy(current_extrema_pred, extrema_targets)
loss = loss + 0.1 * extrema_loss
# MSE loss for price value regression
price_value_loss = nn.MSELoss()(price_values_pred, price_value_targets)
# Combine all price prediction losses
price_loss = immediate_loss + 0.7 * midterm_loss + 0.5 * longterm_loss + 0.3 * price_value_loss
# Create extrema labels (same as before)
extrema_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 2 # Default: neither
# Identify potential bottoms (significant negative change)
bottoms = (immediate_changes < -0.003)
extrema_labels[bottoms] = 0
# Identify potential tops (significant positive change)
tops = (immediate_changes > 0.003)
extrema_labels[tops] = 1
# Calculate extrema prediction loss
if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= actual_batch_size:
current_extrema_pred = current_extrema_pred[:actual_batch_size]
extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)
# Combined loss with all components
# Primary task: Q-value learning (RL objective)
# Secondary tasks: extrema detection and price prediction (supervised objectives)
loss = q_loss + 0.3 * extrema_loss + 0.3 * price_loss
# Log loss components occasionally
if random.random() < 0.01: # Log 1% of the time
logger.info(
f"Mixed precision losses: Q-loss={q_loss.item():.4f}, "
f"Extrema-loss={extrema_loss.item():.4f}, "
f"Price-loss={price_loss.item():.4f}"
)
except Exception as e:
# Fallback if price extraction fails
logger.warning(f"Failed to calculate price prediction loss: {str(e)}. Using only Q-value loss.")
# Just use Q-value loss
loss = q_loss
# Ensure loss requires gradients before backward pass
if not loss.requires_grad:
logger.warning("Loss tensor does not require gradients, skipping backward pass")
return 0.0
# Backward pass with scaled gradients
self.scaler.scale(loss).backward()
# Gradient clipping on scaled gradients
self.scaler.unscale_(self.optimizer)
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
# Update with scaler
self.scaler.step(self.optimizer)
self.scaler.update()
# Update target network if needed
self.update_count += 1
if self.update_count % self.target_update == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
# Track and decay epsilon
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
return loss.item()
logger.debug(f"Could not add auxiliary loss in mixed precision: {e}")
# Check if loss requires gradients
if not loss.requires_grad:
logger.warning("Loss does not require gradients in mixed precision training")
return 0.0
# Scale and backward pass
self.scaler.scale(loss).backward()
# Unscale gradients and clip
self.scaler.unscale_(self.optimizer)
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
# Check for valid gradients
has_valid_gradients = False
for param in self.policy_net.parameters():
if param.grad is not None and torch.any(torch.isfinite(param.grad)):
has_valid_gradients = True
break
if not has_valid_gradients:
logger.warning("No valid gradients in mixed precision training")
self.scaler.update() # Still update scaler
return 0.0
# Optimizer step with scaler
self.scaler.step(self.optimizer)
self.scaler.update()
# Update target network
self.training_steps += 1
if self.training_steps % self.target_update_freq == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
logger.debug(f"Target network updated at step {self.training_steps}")
return loss.item()
except Exception as e:
logger.error(f"Error in mixed precision training: {str(e)}")
logger.warning("Falling back to standard precision training")
# Fall back to standard training
return self._replay_standard(states, actions, rewards, next_states, dones)
logger.error(f"Error in mixed precision replay: {e}")
return 0.0
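
The mixed-precision path above follows the standard torch.cuda.amp pattern. A minimal generic sketch, independent of this agent's multi-head networks and auxiliary losses, where scaler is a torch.cuda.amp.GradScaler() and the function name is illustrative:

import torch

def amp_train_step(model, optimizer, scaler, loss_fn, inputs, targets, max_norm=1.0):
    """One mixed-precision step: autocast forward, scaled backward, clip, step."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = loss_fn(model(inputs), targets)
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)                       # unscale so clipping sees true gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)                           # step is skipped if gradients overflowed
    scaler.update()
    return loss.item()
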
def train_on_extrema(self, states, actions, rewards, next_states, dones):
"""