training progress
@@ -337,8 +337,8 @@ class RealTrainingAdapter:
 # Get training config
 training_config = test_case.get('training_config', {})
 timeframes = training_config.get('timeframes', ['1s', '1m', '1h', '1d'])
-# Reduce sequence length to avoid OOM - 200 candles is more reasonable
-# With 5 timeframes, this gives 1000 total positions vs 3000 with 600 candles
+# RESTORED: 200 candles per timeframe (memory leak fixed)
+# With 5 timeframes * 200 candles = 1000 total positions
 candles_per_timeframe = training_config.get('candles_per_timeframe', 200)  # 200 candles per batch

 # Determine secondary symbol based on primary symbol
@@ -586,20 +586,24 @@ class RealTrainingAdapter:
 logger.info(f" Test case {i+1}: Added {len(hold_samples)} HOLD samples (during position)")

 # Create EXIT sample (where model SHOULD exit trade)
-exit_timestamp = test_case.get('annotation_metadata', {}).get('exit_timestamp')
-if exit_timestamp:
+# Exit info is in expected_outcome, not annotation_metadata
+exit_price = expected_outcome.get('exit_price')
+if exit_price:
+    # For now, use same market state (TODO: fetch market state at exit time)
+    # The model will learn to exit based on profit_loss_pct and position state
     exit_sample = {
-        'market_state': market_state,  # TODO: Get market state at exit time
+        'market_state': market_state,  # Using entry market state as proxy
         'action': 'CLOSE',
         'direction': expected_outcome.get('direction'),
         'profit_loss_pct': expected_outcome.get('profit_loss_pct'),
         'entry_price': expected_outcome.get('entry_price'),
-        'exit_price': expected_outcome.get('exit_price'),
-        'timestamp': exit_timestamp,
-        'label': 'EXIT'  # Exit signal
+        'exit_price': exit_price,
+        'timestamp': test_case.get('timestamp'),  # Entry timestamp (exit time not stored separately)
+        'label': 'EXIT',  # Exit signal
+        'in_position': True  # Model is in position when deciding to exit
     }
     training_data.append(exit_sample)
-    logger.info(f" Test case {i+1}: EXIT sample @ {exit_sample['exit_price']} ({exit_sample['profit_loss_pct']:.2f}%)")
+    logger.info(f" Test case {i+1}: EXIT sample @ {exit_price} ({expected_outcome.get('profit_loss_pct', 0):.2f}%)")

 # Create NEGATIVE samples (where model should NOT trade)
 # These are candles before and after the signal (±15 candles)
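The hunk above reworks how the EXIT training sample is built: the exit information now comes from `expected_outcome` rather than `annotation_metadata`, and the sample gains an `in_position` flag. Below is a minimal sketch of that construction assuming a dict-shaped test case; the helper name `build_exit_sample` and the example values are illustrative, not the repository's code.

```python
# Minimal sketch (not the adapter's code) of deriving an EXIT sample from an
# annotation test case, mirroring the fields used in the diff above.
from typing import Optional


def build_exit_sample(test_case: dict, market_state: dict) -> Optional[dict]:
    expected_outcome = test_case.get('expected_outcome', {})
    exit_price = expected_outcome.get('exit_price')
    if not exit_price:
        return None  # no exit information recorded for this annotation
    return {
        'market_state': market_state,             # entry-time market state used as proxy
        'action': 'CLOSE',
        'direction': expected_outcome.get('direction'),
        'profit_loss_pct': expected_outcome.get('profit_loss_pct'),
        'entry_price': expected_outcome.get('entry_price'),
        'exit_price': exit_price,
        'timestamp': test_case.get('timestamp'),  # entry timestamp (exit time not stored)
        'label': 'EXIT',
        'in_position': True,                      # a position is open when deciding to exit
    }


if __name__ == '__main__':
    case = {
        'timestamp': '2024-01-01T00:00:00Z',
        'expected_outcome': {'direction': 'LONG', 'entry_price': 100.0,
                             'exit_price': 101.5, 'profit_loss_pct': 1.5},
    }
    print(build_exit_sample(case, market_state={}))
```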
@@ -1186,12 +1190,13 @@ class RealTrainingAdapter:
 timeframes = market_state.get('timeframes', {})
 secondary_timeframes = market_state.get('secondary_timeframes', {})

-# Target sequence length - use actual data length (typically 200 candles)
-# Find the first available timeframe to determine sequence length
-target_seq_len = 200  # Default
+# Target sequence length - RESTORED to 200 (memory leak fixed)
+# With 5 timeframes * 200 candles = 1000 sequence positions
+# Memory management fixes allow full sequence length
+target_seq_len = 200  # Restored to original
 for tf_data in timeframes.values():
     if tf_data and 'close' in tf_data and len(tf_data['close']) > 0:
-        target_seq_len = min(len(tf_data['close']), 200)  # Cap at 200 to avoid OOM
+        target_seq_len = min(len(tf_data['close']), 200)  # Cap at 200
         break

 # Extract each timeframe (returns None if not available)
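The sequence-length logic above takes the first timeframe that has close prices and caps it at 200 candles. A small standalone sketch of that logic; `infer_target_seq_len` is a hypothetical helper name, not a method from the adapter.

```python
# Illustrative sketch of the sequence-length capping shown in the hunk above.
def infer_target_seq_len(timeframes: dict, cap: int = 200) -> int:
    """Use the first timeframe that has close prices, capped at `cap` candles."""
    for tf_data in timeframes.values():
        if tf_data and 'close' in tf_data and len(tf_data['close']) > 0:
            return min(len(tf_data['close']), cap)
    return cap  # default when no timeframe has data


print(infer_target_seq_len({'1m': {'close': [1.0] * 350}, '1h': {'close': []}}))  # 200
print(infer_target_seq_len({'1m': {'close': [1.0] * 120}}))                       # 120
```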
@@ -1409,12 +1414,14 @@ class RealTrainingAdapter:
 # For HOLD samples, expect no price change
 future_price_ratio = 0.0

-future_prices = torch.tensor([future_price_ratio], dtype=torch.float32)
+# FIXED: Shape must be [batch, 1] to match price_head output
+future_prices = torch.tensor([[future_price_ratio]], dtype=torch.float32)  # [1, 1]

 # Trade success (1.0 if profitable, 0.0 otherwise)
-# Shape must be [batch_size, 1] to match confidence head output
+# Shape must be [batch_size, 1] to match confidence head output [batch, 1]
 profit_loss_pct = training_sample.get('profit_loss_pct', 0.0)
-trade_success = torch.tensor([[1.0 if profit_loss_pct > 0 else 0.0]], dtype=torch.float32)
+# FIXED: Ensure shape is [1, 1] not [1] to match BCELoss requirements
+trade_success = torch.tensor([[1.0 if profit_loss_pct > 0 else 0.0]], dtype=torch.float32)  # [1, 1]

 # Return batch dictionary with ALL timeframes
 batch = {
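The fix above gives both auxiliary targets the shape `[batch, 1]` so they line up with the price head and confidence head outputs. A self-contained sketch, with stand-in head outputs, showing the shapes working under `MSELoss` and `BCELoss`:

```python
# Standalone sketch: targets built as [1, 1] match head outputs of shape [batch, 1];
# a flat [1] target would rely on broadcasting and can raise shape warnings/errors.
import torch
import torch.nn as nn

future_price_ratio = 0.0
profit_loss_pct = 1.2

# Target shapes used by the adapter after the fix: [batch=1, 1]
future_prices = torch.tensor([[future_price_ratio]], dtype=torch.float32)
trade_success = torch.tensor([[1.0 if profit_loss_pct > 0 else 0.0]], dtype=torch.float32)

price_pred = torch.zeros(1, 1)                  # stand-in for the price head output
confidence = torch.sigmoid(torch.zeros(1, 1))   # stand-in for the confidence head output

print(nn.MSELoss()(price_pred, future_prices))  # shapes match
print(nn.BCELoss()(confidence, trade_success))  # shapes match
```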
@@ -1580,8 +1587,8 @@ class RealTrainingAdapter:
 logger.info(f" Converted {len(training_data)} samples to {len(converted_batches)} training batches")

 # Use batch size of 1 to avoid OOM with large sequence lengths
-# With 5 timeframes * 600 candles = 3000 sequence positions per sample,
-# even batch_size=5 causes 15,000 positions which is too large for GPU
+# With 5 timeframes * 100 candles = 500 sequence positions per sample
+# Batch size of 1 ensures we don't exceed GPU memory (8GB)
 mini_batch_size = 1  # Process one sample at a time to avoid OOM

 def _combine_batches(batch_list: List[Dict[str, 'torch.Tensor']]) -> Dict[str, 'torch.Tensor']:
@@ -1623,6 +1630,10 @@ class RealTrainingAdapter:
 epoch_accuracy = 0.0
 num_batches = 0

+# Clear CUDA cache before epoch
+if torch.cuda.is_available():
+    torch.cuda.empty_cache()
+
 for i, batch in enumerate(grouped_batches):
     try:
         # Determine if this is an accumulation step or optimizer step
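The lines added here clear the CUDA allocator cache once per epoch. A tiny guarded helper in the same spirit is sketched below; note that `torch.cuda.empty_cache()` only returns cached, unreferenced blocks to the driver and does not free live tensors, which is why the later hunks also delete batch tensors explicitly.

```python
# Guarded cache clear, safe on CPU-only machines.
import torch


def clear_cuda_cache() -> None:
    """Release cached, unused GPU memory back to the driver (no-op on CPU)."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


clear_cuda_cache()  # e.g. at the start of every training epoch
```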
@@ -1639,16 +1650,41 @@ class RealTrainingAdapter:
         epoch_accuracy += batch_accuracy
         num_batches += 1

-        # Log first batch and every 10th batch for debugging
-        if (i + 1) == 1 or (i + 1) % 10 == 0:
+        # Log first batch and every 5th batch for debugging
+        if (i + 1) == 1 or (i + 1) % 5 == 0:
             logger.info(f" Batch {i + 1}/{len(grouped_batches)}, Loss: {batch_loss:.6f}, Action Acc: {batch_accuracy:.2%}, Candle Acc: {batch_candle_accuracy:.2%}")
     else:
         logger.warning(f" Batch {i + 1} returned None result - skipping")

-    # Clear CUDA cache after optimizer step (not accumulation step)
-    if torch.cuda.is_available() and not is_accumulation_step:
+    # CRITICAL FIX: Delete batch tensors immediately to free GPU memory
+    # This prevents memory accumulation during gradient accumulation
+    for key in list(batch.keys()):
+        if isinstance(batch[key], torch.Tensor):
+            del batch[key]
+    del batch
+
+    # CRITICAL: Clear CUDA cache after EVERY batch to prevent memory accumulation
+    # This is essential with large models and limited GPU memory
+    if torch.cuda.is_available():
         torch.cuda.empty_cache()

+    # After optimizer step (not accumulation), force garbage collection
+    if not is_accumulation_step:
+        import gc
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+except torch.cuda.OutOfMemoryError as oom_error:
+    logger.error(f" CUDA OOM in batch {i + 1}: {oom_error}")
+    # Aggressive memory cleanup on OOM
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+    # Reset optimizer state to prevent corruption
+    trainer.optimizer.zero_grad(set_to_none=True)
+    logger.warning(f" Skipping batch {i + 1} due to OOM, optimizer state reset")
+    continue
 except Exception as e:
     logger.error(f" Error in batch {i + 1}: {e}")
     import traceback
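The enlarged hunk above deletes batch tensors as soon as a step finishes, clears the cache after every batch, garbage-collects after real optimizer steps, and skips a batch on CUDA OOM after resetting gradients. Below is a condensed, runnable sketch of that loop shape; `DummyTrainer`, the random batches, and `accumulation_steps = 2` are stand-ins, not the adapter's actual objects.

```python
# Condensed sketch of the per-batch cleanup and OOM-skip pattern from the hunk above.
import gc
import torch


class DummyTrainer:
    """Stand-in trainer: a single Linear layer with gradient accumulation support."""
    def __init__(self):
        self.model = torch.nn.Linear(8, 3)
        self.optimizer = torch.optim.Adam(self.model.parameters())

    def train_step(self, batch, accumulate_gradients=False):
        loss = self.model(batch['x']).mean()
        loss.backward()
        if not accumulate_gradients:
            self.optimizer.step()
            self.optimizer.zero_grad(set_to_none=True)
        return {'total_loss': float(loss)}


trainer = DummyTrainer()
grouped_batches = [{'x': torch.randn(1, 8)} for _ in range(4)]
accumulation_steps = 2

for i, batch in enumerate(grouped_batches):
    is_accumulation_step = (i + 1) % accumulation_steps != 0
    try:
        result = trainer.train_step(batch, accumulate_gradients=is_accumulation_step)
        print(f"batch {i + 1}: loss {result['total_loss']:.4f}")

        # Free batch tensors immediately so references don't pile up
        for key in list(batch.keys()):
            if isinstance(batch[key], torch.Tensor):
                del batch[key]
        del batch

        if torch.cuda.is_available():
            torch.cuda.empty_cache()      # after every batch
        if not is_accumulation_step:
            gc.collect()                  # only after real optimizer steps
            if torch.cuda.is_available():
                torch.cuda.synchronize()
    except torch.cuda.OutOfMemoryError:
        # Skip the offending batch and drop any accumulated gradients
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        trainer.optimizer.zero_grad(set_to_none=True)
        continue
```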
@@ -23,19 +23,19 @@ logger = logging.getLogger(__name__)

 @dataclass
 class TradingTransformerConfig:
-    """Configuration for trading transformer models - SCALED TO 46M PARAMETERS"""
-    # Model architecture - SCALED UP
-    d_model: int = 1024  # Model dimension (2x increase)
-    n_heads: int = 16  # Number of attention heads (2x increase)
-    n_layers: int = 12  # Number of transformer layers (2x increase)
-    d_ff: int = 4096  # Feed-forward dimension (2x increase)
+    """Configuration for trading transformer models - WITH PROPER MEMORY MANAGEMENT"""
+    # Model architecture - RESTORED to original size (memory leak fixed)
+    d_model: int = 1024  # Model dimension
+    n_heads: int = 16  # Number of attention heads
+    n_layers: int = 12  # Number of transformer layers
+    d_ff: int = 4096  # Feed-forward dimension
     dropout: float = 0.1  # Dropout rate

-    # Input dimensions - ENHANCED
-    seq_len: int = 150  # Sequence length for time series (1.5x increase)
-    cob_features: int = 100  # COB feature dimension (2x increase)
-    tech_features: int = 40  # Technical indicator features (2x increase)
-    market_features: int = 30  # Market microstructure features (2x increase)
+    # Input dimensions - RESTORED
+    seq_len: int = 200  # Sequence length for time series
+    cob_features: int = 100  # COB feature dimension
+    tech_features: int = 40  # Technical indicator features
+    market_features: int = 30  # Market microstructure features

     # Output configuration
     n_actions: int = 3  # BUY, SELL, HOLD
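For reference, the restored defaults can be read off the hunk as a plain dataclass. This sketch mirrors only the fields shown in the diff (it is not the full config class) and repeats the commit's arithmetic of 5 timeframes times 200 candles.

```python
# Partial mirror of the restored configuration defaults from the hunk above.
from dataclasses import dataclass


@dataclass
class TradingTransformerConfig:
    # Model architecture (restored values)
    d_model: int = 1024      # model dimension
    n_heads: int = 16        # attention heads
    n_layers: int = 12       # transformer layers
    d_ff: int = 4096         # feed-forward dimension
    dropout: float = 0.1

    # Input dimensions (restored values)
    seq_len: int = 200       # candles per timeframe
    cob_features: int = 100
    tech_features: int = 40
    market_features: int = 30

    # Output configuration
    n_actions: int = 3       # BUY, SELL, HOLD


cfg = TradingTransformerConfig()
# With 5 timeframes of `seq_len` candles each, the flattened sequence is:
print(5 * cfg.seq_len)  # 1000 positions, as the commit comments note
```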
@@ -358,6 +358,7 @@ class AdvancedTradingTransformer(nn.Module):

 # SERIAL: Shared pattern encoder (learns candle patterns ONCE for all timeframes)
 # This is applied to each timeframe independently but uses SAME weights
+# RESTORED: Original dimensions (memory leak fixed)
 self.shared_pattern_encoder = nn.Sequential(
     nn.Linear(5, config.d_model // 4),  # 5 OHLCV -> 256
     nn.LayerNorm(config.d_model // 4),
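The comment added here documents that one small encoder is shared across timeframes. A sketch of that weight-sharing idea follows, assuming the `5 -> d_model // 4` projection visible in the diff; the `GELU` activation, the timeframe keys, and the batch shapes are assumptions, since the rest of the `Sequential` is not shown.

```python
# Sketch of a shared per-candle encoder reused (same weights) across timeframes.
import torch
import torch.nn as nn

d_model = 1024
shared_pattern_encoder = nn.Sequential(
    nn.Linear(5, d_model // 4),   # 5 OHLCV features -> 256
    nn.LayerNorm(d_model // 4),
    nn.GELU(),                    # assumed activation, not shown in the diff
)

# Five inputs (four timeframes plus a secondary symbol), 200 candles each, 5 OHLCV values
timeframes = {tf: torch.randn(1, 200, 5) for tf in ['1s', '1m', '1h', '1d', 'secondary']}

encoded = {tf: shared_pattern_encoder(x) for tf, x in timeframes.items()}
for tf, emb in encoded.items():
    print(tf, emb.shape)  # torch.Size([1, 200, 256]) for every timeframe
```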
@@ -1163,12 +1164,13 @@ class TradingTransformerTrainer:
 self.model.train()

 # Only zero gradients if not accumulating
+# Use set_to_none=True for better memory efficiency
 if not accumulate_gradients:
-    self.optimizer.zero_grad()
+    self.optimizer.zero_grad(set_to_none=True)

 # Move batch to device WITHOUT cloning to avoid version tracking issues
 # The detach().clone() was causing gradient computation errors
-batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v
+batch = {k: v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else v
         for k, v in batch.items()}

 # Use automatic mixed precision (FP16) for memory efficiency
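Two small changes land here: gradients are cleared with `set_to_none=True` (grads become `None` instead of zero-filled tensors), and batch tensors are moved with `non_blocking=True`, which only overlaps copy and compute when the source tensors are in pinned memory. A minimal sketch with a stand-in model:

```python
# Minimal sketch of the two micro-optimizations from the hunk above.
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = nn.Linear(10, 3).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

batch = {'x': torch.randn(4, 10), 'actions': torch.randint(0, 3, (4,))}

# Memory-friendly gradient reset (grads become None rather than zero tensors)
optimizer.zero_grad(set_to_none=True)

# Asynchronous-capable transfer of every tensor in the batch
batch = {k: v.to(device, non_blocking=True) if isinstance(v, torch.Tensor) else v
         for k, v in batch.items()}

loss = nn.CrossEntropyLoss()(model(batch['x']), batch['actions'])
loss.backward()
optimizer.step()
print(float(loss))
```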
@@ -1189,18 +1191,34 @@ class TradingTransformerTrainer:

 # Calculate losses
 action_loss = self.action_criterion(outputs['action_logits'], batch['actions'])
-price_loss = self.price_criterion(outputs['price_prediction'], batch['future_prices'])
+
+# FIXED: Ensure shapes match for MSELoss
+price_pred = outputs['price_prediction']
+price_target = batch['future_prices']
+
+# Both should be [batch, 1], but ensure they match
+if price_pred.shape != price_target.shape:
+    logger.debug(f"Reshaping price target from {price_target.shape} to {price_pred.shape}")
+    price_target = price_target.view(price_pred.shape)
+
+price_loss = self.price_criterion(price_pred, price_target)

 # Start with base losses - avoid inplace operations on computation graph
 total_loss = action_loss + 0.1 * price_loss  # Weight auxiliary task

+# CRITICAL FIX: Scale loss for gradient accumulation
+# This prevents gradient explosion when accumulating over multiple batches
+if accumulate_gradients:
+    # Assume accumulation over 5 steps (should match training loop)
+    total_loss = total_loss / 5.0
+
 # Add confidence loss if available
 if 'confidence' in outputs and 'trade_success' in batch:
     # Both tensors should have shape [batch_size, 1] for BCELoss
     confidence_pred = outputs['confidence']
     trade_target = batch['trade_success'].float()

-    # Ensure both are 2D tensors [batch_size, 1]
+    # FIXED: Ensure both are 2D tensors [batch_size, 1]
     # Handle different input shapes robustly
     if confidence_pred.dim() == 0:
         # Scalar -> [1, 1]
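The key addition in this hunk is scaling the loss by the number of accumulation steps, so that the summed gradients approximate one averaged large-batch gradient, plus reshaping the price target to match the prediction before `MSELoss`. Below is a sketch of both, assuming a stand-in model and the diff's hard-coded 5 accumulation steps:

```python
# Sketch of gradient-accumulation loss scaling and target-shape alignment.
import torch
import torch.nn as nn

model = nn.Linear(10, 3)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
action_criterion = nn.CrossEntropyLoss()
price_criterion = nn.MSELoss()

accumulation_steps = 5                 # matches the divisor used in the diff
optimizer.zero_grad(set_to_none=True)

for step in range(accumulation_steps):
    x = torch.randn(1, 10)
    actions = torch.randint(0, 3, (1,))
    future_prices = torch.randn(1)     # arrives as shape [1]

    action_logits = model(x)
    price_pred = action_logits[:, :1]  # stand-in price head output, shape [1, 1]

    # Align the target shape with the prediction before MSELoss, as in the diff
    price_target = future_prices
    if price_pred.shape != price_target.shape:
        price_target = price_target.view(price_pred.shape)

    total_loss = action_criterion(action_logits, actions) + 0.1 * price_criterion(price_pred, price_target)
    (total_loss / accumulation_steps).backward()  # scale so accumulated gradients don't explode

optimizer.step()
optimizer.zero_grad(set_to_none=True)
```

As the diff's own comment concedes, the hard-coded divisor only stays correct while the outer loop accumulates over exactly five batches; passing the step count into `train_step` would keep the two in sync.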
@@ -1208,6 +1226,9 @@ class TradingTransformerTrainer:
     elif confidence_pred.dim() == 1:
         # [batch_size] -> [batch_size, 1]
         confidence_pred = confidence_pred.unsqueeze(-1)
+    elif confidence_pred.dim() == 3:
+        # [batch_size, seq_len, 1] -> [batch_size, 1] (take last timestep)
+        confidence_pred = confidence_pred[:, -1, :]

     if trade_target.dim() == 0:
         # Scalar -> [1, 1]
@@ -1292,11 +1313,31 @@ class TradingTransformerTrainer:
         'learning_rate': self.scheduler.get_last_lr()[0]
     }

-    # Delete large tensors to free memory immediately
+    # CRITICAL: Delete large tensors to free memory immediately
+    # This prevents memory accumulation across batches
     del outputs, total_loss, action_loss, price_loss, predictions, accuracy
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()

     return result

+except torch.cuda.OutOfMemoryError as oom_error:
+    logger.error(f"CUDA OOM in train_step: {oom_error}")
+    # Aggressive cleanup on OOM
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+    # Reset optimizer state to prevent corruption
+    self.optimizer.zero_grad(set_to_none=True)
+    # Return zero loss to continue training
+    return {
+        'total_loss': 0.0,
+        'action_loss': 0.0,
+        'price_loss': 0.0,
+        'accuracy': 0.0,
+        'candle_accuracy': 0.0,
+        'learning_rate': self.scheduler.get_last_lr()[0] if hasattr(self, 'scheduler') else 0.0
+    }
 except Exception as e:
     logger.error(f"Error in train_step: {e}", exc_info=True)
     # Clear any partial computations
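The OOM handler added to `train_step` mirrors the one in the batch loop: clean up, reset gradients, and return zeroed metrics so the epoch can continue. A sketch of that wrapper shape, with `safe_train_step` and `_forward_and_loss` standing in for the real method and its forward/backward logic:

```python
# Sketch of the OOM-recovery pattern added to train_step in the hunk above.
import logging
import torch

logger = logging.getLogger(__name__)


def safe_train_step(model, optimizer, scheduler, batch, _forward_and_loss):
    try:
        result = _forward_and_loss(model, optimizer, batch)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()   # free cached blocks once tensors are released
        return result
    except torch.cuda.OutOfMemoryError as oom_error:
        logger.error(f"CUDA OOM in train_step: {oom_error}")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        optimizer.zero_grad(set_to_none=True)   # avoid a corrupted optimizer state
        return {
            'total_loss': 0.0, 'action_loss': 0.0, 'price_loss': 0.0,
            'accuracy': 0.0, 'candle_accuracy': 0.0,
            'learning_rate': scheduler.get_last_lr()[0] if scheduler else 0.0,
        }
```

Returning zeros keeps the epoch averages defined, at the cost of slightly diluting them whenever a batch is skipped.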
@@ -1309,6 +1350,7 @@ class TradingTransformerTrainer:
     'action_loss': 0.0,
     'price_loss': 0.0,
     'accuracy': 0.0,
+    'candle_accuracy': 0.0,
     'learning_rate': self.scheduler.get_last_lr()[0] if hasattr(self, 'scheduler') else 0.0
 }