training progress

Dobromir Popov
2025-11-10 20:01:07 +02:00
parent a2d34c6d7c
commit 999dea9eb0
2 changed files with 116 additions and 38 deletions


@@ -23,19 +23,19 @@ logger = logging.getLogger(__name__)
@dataclass
class TradingTransformerConfig:
"""Configuration for trading transformer models - SCALED TO 46M PARAMETERS"""
# Model architecture - SCALED UP
d_model: int = 1024 # Model dimension (2x increase)
n_heads: int = 16 # Number of attention heads (2x increase)
n_layers: int = 12 # Number of transformer layers (2x increase)
d_ff: int = 4096 # Feed-forward dimension (2x increase)
"""Configuration for trading transformer models - WITH PROPER MEMORY MANAGEMENT"""
# Model architecture - RESTORED to original size (memory leak fixed)
d_model: int = 1024 # Model dimension
n_heads: int = 16 # Number of attention heads
n_layers: int = 12 # Number of transformer layers
d_ff: int = 4096 # Feed-forward dimension
dropout: float = 0.1 # Dropout rate
# Input dimensions - ENHANCED
seq_len: int = 150 # Sequence length for time series (1.5x increase)
cob_features: int = 100 # COB feature dimension (2x increase)
tech_features: int = 40 # Technical indicator features (2x increase)
market_features: int = 30 # Market microstructure features (2x increase)
# Input dimensions - RESTORED
seq_len: int = 200 # Sequence length for time series
cob_features: int = 100 # COB feature dimension
tech_features: int = 40 # Technical indicator features
market_features: int = 30 # Market microstructure features
# Output configuration
n_actions: int = 3 # BUY, SELL, HOLD
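The docstring still advertises 46M parameters while the restored dimensions (d_model=1024, n_layers=12, d_ff=4096) imply a different total, so it is worth counting parameters directly after a config change. A minimal sketch, assuming the config and model classes shown in this diff can be imported from the module they live in (the import path below is illustrative, and the constructor is assumed to take the config object):

# Hypothetical import path; point it at the module this diff modifies.
from advanced_trading_transformer import TradingTransformerConfig, AdvancedTradingTransformer

config = TradingTransformerConfig()          # d_model=1024, n_layers=12, d_ff=4096
model = AdvancedTradingTransformer(config)   # assumption: constructor takes the config

total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"total: {total / 1e6:.1f}M params, trainable: {trainable / 1e6:.1f}M")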
@@ -358,6 +358,7 @@ class AdvancedTradingTransformer(nn.Module):
# SERIAL: Shared pattern encoder (learns candle patterns ONCE for all timeframes)
# This is applied to each timeframe independently but uses SAME weights
# RESTORED: Original dimensions (memory leak fixed)
self.shared_pattern_encoder = nn.Sequential(
nn.Linear(5, config.d_model // 4), # 5 OHLCV -> 256
nn.LayerNorm(config.d_model // 4),
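The "SERIAL" comment describes one encoder, with one set of weights, applied to every timeframe. A minimal sketch of that weight-sharing pattern; the batch size, sequence length, and timeframe names are purely illustrative, and the GELU activation is an assumption since the hunk cuts off after the LayerNorm:

import torch
import torch.nn as nn

d_model = 1024
shared_pattern_encoder = nn.Sequential(
    nn.Linear(5, d_model // 4),     # 5 OHLCV features -> 256
    nn.LayerNorm(d_model // 4),
    nn.GELU(),                      # assumed activation
)

# Illustrative per-timeframe inputs: [batch, seq_len, 5 OHLCV]
timeframe_inputs = {tf: torch.randn(8, 200, 5) for tf in ("1m", "5m", "1h")}

# Calling the SAME module for each timeframe reuses the same parameters,
# so candle patterns are learned once and shared across all timeframes.
encoded = {tf: shared_pattern_encoder(x) for tf, x in timeframe_inputs.items()}
print({tf: tuple(t.shape) for tf, t in encoded.items()})   # each -> (8, 200, 256)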
@@ -1163,12 +1164,13 @@ class TradingTransformerTrainer:
self.model.train()
# Only zero gradients if not accumulating
# Use set_to_none=True for better memory efficiency
if not accumulate_gradients:
self.optimizer.zero_grad()
self.optimizer.zero_grad(set_to_none=True)
# Move batch to device WITHOUT cloning to avoid version tracking issues
# The detach().clone() was causing gradient computation errors
batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v
batch = {k: v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else v
for k, v in batch.items()}
# Use automatic mixed precision (FP16) for memory efficiency
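Two of the changes in this hunk are small but easy to get wrong elsewhere: zero_grad(set_to_none=True) and non_blocking device transfers. A minimal, generic sketch of both idioms; the helper names are illustrative and not part of the trainer:

import torch

def prepare_batch(batch: dict, device: torch.device) -> dict:
    # non_blocking=True lets host->GPU copies overlap with computation, but it
    # only helps when the source tensors are in pinned memory (for example via
    # DataLoader(pin_memory=True)); otherwise it falls back to a normal copy.
    return {k: v.to(device, non_blocking=True) if isinstance(v, torch.Tensor) else v
            for k, v in batch.items()}

def reset_gradients(optimizer: torch.optim.Optimizer, accumulate_gradients: bool) -> None:
    # set_to_none=True frees the .grad tensors instead of writing zeros into
    # them, saving memory and one kernel launch per parameter; the gradients
    # are recreated automatically on the next backward().
    if not accumulate_gradients:
        optimizer.zero_grad(set_to_none=True)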
@@ -1189,10 +1191,26 @@ class TradingTransformerTrainer:
# Calculate losses
action_loss = self.action_criterion(outputs['action_logits'], batch['actions'])
price_loss = self.price_criterion(outputs['price_prediction'], batch['future_prices'])
# FIXED: Ensure shapes match for MSELoss
price_pred = outputs['price_prediction']
price_target = batch['future_prices']
# Both should be [batch, 1], but ensure they match
if price_pred.shape != price_target.shape:
logger.debug(f"Reshaping price target from {price_target.shape} to {price_pred.shape}")
price_target = price_target.view(price_pred.shape)
price_loss = self.price_criterion(price_pred, price_target)
# Start with base losses - avoid inplace operations on computation graph
total_loss = action_loss + 0.1 * price_loss # Weight auxiliary task
# CRITICAL FIX: Scale loss for gradient accumulation
# This prevents gradient explosion when accumulating over multiple batches
if accumulate_gradients:
# Assume accumulation over 5 steps (should match training loop)
total_loss = total_loss / 5.0
# Add confidence loss if available
if 'confidence' in outputs and 'trade_success' in batch:
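Dividing the loss by the number of accumulation steps keeps the summed gradients comparable to one update over a single large batch, because backward() adds into .grad across calls. A generic sketch of the surrounding loop, using the 5 steps assumed in the comment above; this is not the project's training loop, just the pattern:

import torch

def accumulation_sketch(model, optimizer, criterion, batches, accumulation_steps=5):
    # Gradients accumulate across backward() calls, so each loss is scaled by
    # 1/accumulation_steps and the optimizer only steps every N batches.
    optimizer.zero_grad(set_to_none=True)
    for i, (inputs, targets) in enumerate(batches):
        loss = criterion(model(inputs), targets) / accumulation_steps
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)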
@@ -1200,7 +1218,7 @@ class TradingTransformerTrainer:
confidence_pred = outputs['confidence']
trade_target = batch['trade_success'].float()
# Ensure both are 2D tensors [batch_size, 1]
# FIXED: Ensure both are 2D tensors [batch_size, 1]
# Handle different input shapes robustly
if confidence_pred.dim() == 0:
# Scalar -> [1, 1]
@@ -1208,6 +1226,9 @@ class TradingTransformerTrainer:
elif confidence_pred.dim() == 1:
# [batch_size] -> [batch_size, 1]
confidence_pred = confidence_pred.unsqueeze(-1)
elif confidence_pred.dim() == 3:
# [batch_size, seq_len, 1] -> [batch_size, 1] (take last timestep)
confidence_pred = confidence_pred[:, -1, :]
if trade_target.dim() == 0:
# Scalar -> [1, 1]
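The branches above coerce confidence_pred and trade_target to the same [batch_size, 1] shape before the confidence loss. The same normalization can be expressed as one small helper; a sketch with an illustrative name:

import torch

def to_batch_column(t: torch.Tensor) -> torch.Tensor:
    # Mirror the shape handling above: scalar -> [1, 1], [batch] -> [batch, 1],
    # [batch, seq_len, 1] -> last timestep [batch, 1]; anything else is assumed
    # to already be [batch, 1].
    if t.dim() == 0:
        return t.view(1, 1)
    if t.dim() == 1:
        return t.unsqueeze(-1)
    if t.dim() == 3:
        return t[:, -1, :]
    return t

confidence_pred = to_batch_column(torch.rand(8, 150, 1))             # -> [8, 1]
trade_target = to_batch_column(torch.randint(0, 2, (8,)).float())    # -> [8, 1]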
@@ -1292,11 +1313,31 @@ class TradingTransformerTrainer:
'learning_rate': self.scheduler.get_last_lr()[0]
}
# Delete large tensors to free memory immediately
# CRITICAL: Delete large tensors to free memory immediately
# This prevents memory accumulation across batches
del outputs, total_loss, action_loss, price_loss, predictions, accuracy
if torch.cuda.is_available():
torch.cuda.empty_cache()
return result
except torch.cuda.OutOfMemoryError as oom_error:
logger.error(f"CUDA OOM in train_step: {oom_error}")
# Aggressive cleanup on OOM
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
# Reset optimizer state to prevent corruption
self.optimizer.zero_grad(set_to_none=True)
# Return zero loss to continue training
return {
'total_loss': 0.0,
'action_loss': 0.0,
'price_loss': 0.0,
'accuracy': 0.0,
'candle_accuracy': 0.0,
'learning_rate': self.scheduler.get_last_lr()[0] if hasattr(self, 'scheduler') else 0.0
}
except Exception as e:
logger.error(f"Error in train_step: {e}", exc_info=True)
# Clear any partial computations
@@ -1309,6 +1350,7 @@ class TradingTransformerTrainer:
'action_loss': 0.0,
'price_loss': 0.0,
'accuracy': 0.0,
'candle_accuracy': 0.0,
'learning_rate': self.scheduler.get_last_lr()[0] if hasattr(self, 'scheduler') else 0.0
}
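The new except block turns a CUDA out-of-memory error into a zero-loss result so the training loop can keep going. A minimal sketch of that recover-and-continue pattern around an arbitrary step function; torch.cuda.OutOfMemoryError exists in recent PyTorch releases (older versions raise a plain RuntimeError), and the wrapper name is illustrative:

import logging
import torch

logger = logging.getLogger(__name__)

def safe_train_step(step_fn, batch):
    # On OOM: log, release cached blocks, wait for in-flight kernels, and hand
    # back a sentinel result so the caller simply skips this batch.
    try:
        return step_fn(batch)
    except torch.cuda.OutOfMemoryError as oom_error:
        logger.error("CUDA OOM in train step, skipping batch: %s", oom_error)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        return {"total_loss": 0.0, "skipped": True}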