training progress

2025-11-10 20:01:07 +02:00
parent a2d34c6d7c
commit 999dea9eb0
2 changed files with 116 additions and 38 deletions
--- a/ANNOTATE/core/real_training_adapter.py
+++ b/ANNOTATE/core/real_training_adapter.py
@@ -337,8 +337,8 @@ class RealTrainingAdapter:
            # Get training config
            training_config = test_case.get('training_config', {})
            timeframes = training_config.get('timeframes', ['1s', '1m', '1h', '1d'])
-            # Reduce sequence length to avoid OOM - 200 candles is more reasonable
+            # RESTORED: 200 candles per timeframe (memory leak fixed)
-            # With 5 timeframes, this gives 1000 total positions vs 3000 with 600 candles
+            # With 5 timeframes * 200 candles = 1000 total positions
            candles_per_timeframe = training_config.get('candles_per_timeframe', 200)  # 200 candles per batch
            # Determine secondary symbol based on primary symbol
@@ -586,20 +586,24 @@ class RealTrainingAdapter:
                    logger.info(f"   Test case {i+1}: Added {len(hold_samples)} HOLD samples (during position)")
                # Create EXIT sample (where model SHOULD exit trade)
-                exit_timestamp = test_case.get('annotation_metadata', {}).get('exit_timestamp')
+                # Exit info is in expected_outcome, not annotation_metadata
-                if exit_timestamp:
+                exit_price = expected_outcome.get('exit_price')
                if exit_price:
                    # For now, use same market state (TODO: fetch market state at exit time)
                    # The model will learn to exit based on profit_loss_pct and position state
                    exit_sample = {
-                        'market_state': market_state,  # TODO: Get market state at exit time
+                        'market_state': market_state,  # Using entry market state as proxy
                        'action': 'CLOSE',
                        'direction': expected_outcome.get('direction'),
                        'profit_loss_pct': expected_outcome.get('profit_loss_pct'),
                        'entry_price': expected_outcome.get('entry_price'),
-                        'exit_price': expected_outcome.get('exit_price'),
+                        'exit_price': exit_price,
-                        'timestamp': exit_timestamp,
+                        'timestamp': test_case.get('timestamp'),  # Entry timestamp (exit time not stored separately)
-                        'label': 'EXIT'  # Exit signal
+                        'label': 'EXIT',  # Exit signal
                        'in_position': True  # Model is in position when deciding to exit
                    }
                    training_data.append(exit_sample)
-                    logger.info(f"   Test case {i+1}: EXIT sample @ {exit_sample['exit_price']} ({exit_sample['profit_loss_pct']:.2f}%)")
+                    logger.info(f"   Test case {i+1}: EXIT sample @ {exit_price} ({expected_outcome.get('profit_loss_pct', 0):.2f}%)")
                # Create NEGATIVE samples (where model should NOT trade)
                # These are candles before and after the signal (±15 candles)
@@ -1186,12 +1190,13 @@ class RealTrainingAdapter:
            timeframes = market_state.get('timeframes', {})
            secondary_timeframes = market_state.get('secondary_timeframes', {})
-            # Target sequence length - use actual data length (typically 200 candles)
+            # Target sequence length - RESTORED to 200 (memory leak fixed)
-            # Find the first available timeframe to determine sequence length
+            # With 5 timeframes * 200 candles = 1000 sequence positions
-            target_seq_len = 200  # Default
+            # Memory management fixes allow full sequence length
            target_seq_len = 200  # Restored to original
            for tf_data in timeframes.values():
                if tf_data and 'close' in tf_data and len(tf_data['close']) > 0:
-                    target_seq_len = min(len(tf_data['close']), 200)  # Cap at 200 to avoid OOM
+                    target_seq_len = min(len(tf_data['close']), 200)  # Cap at 200
                    break
            # Extract each timeframe (returns None if not available)
@@ -1409,12 +1414,14 @@ class RealTrainingAdapter:
                # For HOLD samples, expect no price change
                future_price_ratio = 0.0
-            future_prices = torch.tensor([future_price_ratio], dtype=torch.float32)
+            # FIXED: Shape must be [batch, 1] to match price_head output
            future_prices = torch.tensor([[future_price_ratio]], dtype=torch.float32)  # [1, 1]
            # Trade success (1.0 if profitable, 0.0 otherwise)
-            # Shape must be [batch_size, 1] to match confidence head output
+            # Shape must be [batch_size, 1] to match confidence head output [batch, 1]
            profit_loss_pct = training_sample.get('profit_loss_pct', 0.0)
-            trade_success = torch.tensor([[1.0 if profit_loss_pct > 0 else 0.0]], dtype=torch.float32)
+            # FIXED: Ensure shape is [1, 1] not [1] to match BCELoss requirements
            trade_success = torch.tensor([[1.0 if profit_loss_pct > 0 else 0.0]], dtype=torch.float32)  # [1, 1]
            # Return batch dictionary with ALL timeframes
            batch = {
@@ -1580,8 +1587,8 @@ class RealTrainingAdapter:
            logger.info(f"    Converted {len(training_data)} samples to {len(converted_batches)} training batches")
            # Use batch size of 1 to avoid OOM with large sequence lengths
-            # With 5 timeframes * 600 candles = 3000 sequence positions per sample,
+            # With 5 timeframes * 200 candles = 1000 sequence positions per sample
-            # even batch_size=5 causes 15,000 positions which is too large for GPU
+            # Batch size of 1 ensures we don't exceed GPU memory (8GB)
            mini_batch_size = 1  # Process one sample at a time to avoid OOM
            def _combine_batches(batch_list: List[Dict[str, 'torch.Tensor']]) -> Dict[str, 'torch.Tensor']:
@@ -1623,6 +1630,10 @@ class RealTrainingAdapter:
                epoch_accuracy = 0.0
                num_batches = 0
                # Clear CUDA cache before epoch
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                for i, batch in enumerate(grouped_batches):
                    try:
                        # Determine if this is an accumulation step or optimizer step
@@ -1639,16 +1650,41 @@ class RealTrainingAdapter:
                            epoch_accuracy += batch_accuracy
                            num_batches += 1
-                            # Log first batch and every 10th batch for debugging
+                            # Log first batch and every 5th batch for debugging
-                            if (i + 1) == 1 or (i + 1) % 10 == 0:
+                            if (i + 1) == 1 or (i + 1) % 5 == 0:
                                logger.info(f"      Batch {i + 1}/{len(grouped_batches)}, Loss: {batch_loss:.6f}, Action Acc: {batch_accuracy:.2%}, Candle Acc: {batch_candle_accuracy:.2%}")
                        else:
                            logger.warning(f"   Batch {i + 1} returned None result - skipping")
-                        # Clear CUDA cache after optimizer step (not accumulation step)
+                        # CRITICAL FIX: Delete batch tensors immediately to free GPU memory
-                        if torch.cuda.is_available() and not is_accumulation_step:
+                        # This prevents memory accumulation during gradient accumulation
                        for key in list(batch.keys()):
                            if isinstance(batch[key], torch.Tensor):
                                del batch[key]
                        del batch
                        # CRITICAL: Clear CUDA cache after EVERY batch to prevent memory accumulation
                        # This is essential with large models and limited GPU memory
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                        # After optimizer step (not accumulation), force garbage collection
                        if not is_accumulation_step:
                            import gc
                            gc.collect()
                            if torch.cuda.is_available():
                                torch.cuda.synchronize()
                    except torch.cuda.OutOfMemoryError as oom_error:
                        logger.error(f"   CUDA OOM in batch {i + 1}: {oom_error}")
                        # Aggressive memory cleanup on OOM
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                            torch.cuda.synchronize()
                        # Drop partially accumulated gradients (zero_grad clears .grad only, not optimizer state)
                        trainer.optimizer.zero_grad(set_to_none=True)
                        logger.warning(f"   Skipping batch {i + 1} due to OOM, optimizer state reset")
                        continue
                    except Exception as e:
                        logger.error(f"   Error in batch {i + 1}: {e}")
                        import traceback
--- a/NN/models/advanced_transformer_trading.py
+++ b/NN/models/advanced_transformer_trading.py
@@ -23,19 +23,19 @@ logger = logging.getLogger(__name__)
@dataclass
 class TradingTransformerConfig:
-    """Configuration for trading transformer models - SCALED TO 46M PARAMETERS"""
+    """Configuration for trading transformer models - WITH PROPER MEMORY MANAGEMENT"""
-    # Model architecture - SCALED UP
+    # Model architecture - RESTORED to original size (memory leak fixed)
-    d_model: int = 1024         # Model dimension (2x increase)
+    d_model: int = 1024         # Model dimension
-    n_heads: int = 16           # Number of attention heads (2x increase)
+    n_heads: int = 16           # Number of attention heads
-    n_layers: int = 12          # Number of transformer layers (2x increase)
+    n_layers: int = 12          # Number of transformer layers
-    d_ff: int = 4096           # Feed-forward dimension (2x increase)
+    d_ff: int = 4096           # Feed-forward dimension
    dropout: float = 0.1        # Dropout rate
-    # Input dimensions - ENHANCED
+    # Input dimensions - RESTORED
-    seq_len: int = 150          # Sequence length for time series (1.5x increase)
+    seq_len: int = 200          # Sequence length for time series
-    cob_features: int = 100     # COB feature dimension (2x increase)
+    cob_features: int = 100     # COB feature dimension
-    tech_features: int = 40     # Technical indicator features (2x increase)
+    tech_features: int = 40     # Technical indicator features
-    market_features: int = 30   # Market microstructure features (2x increase)
+    market_features: int = 30   # Market microstructure features
    # Output configuration
    n_actions: int = 3          # BUY, SELL, HOLD
@@ -358,6 +358,7 @@ class AdvancedTradingTransformer(nn.Module):
        # SERIAL: Shared pattern encoder (learns candle patterns ONCE for all timeframes)
        # This is applied to each timeframe independently but uses SAME weights
        # RESTORED: Original dimensions (memory leak fixed)
        self.shared_pattern_encoder = nn.Sequential(
            nn.Linear(5, config.d_model // 4),  # 5 OHLCV -> 256
            nn.LayerNorm(config.d_model // 4),
@@ -1163,12 +1164,13 @@ class TradingTransformerTrainer:
            self.model.train()
            # Only zero gradients if not accumulating
            # Use set_to_none=True for better memory efficiency
            if not accumulate_gradients:
-                self.optimizer.zero_grad()
+                self.optimizer.zero_grad(set_to_none=True)
            # Move batch to device WITHOUT cloning to avoid version tracking issues
            # The detach().clone() was causing gradient computation errors
-            batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v 
+            batch = {k: v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else v 
                    for k, v in batch.items()}
            # Use automatic mixed precision (FP16) for memory efficiency
@@ -1189,18 +1191,34 @@ class TradingTransformerTrainer:
                # Calculate losses
                action_loss = self.action_criterion(outputs['action_logits'], batch['actions'])
-                price_loss = self.price_criterion(outputs['price_prediction'], batch['future_prices'])
+                
                # FIXED: Ensure shapes match for MSELoss
                price_pred = outputs['price_prediction']
                price_target = batch['future_prices']
                # Both should be [batch, 1], but ensure they match
                if price_pred.shape != price_target.shape:
                    logger.debug(f"Reshaping price target from {price_target.shape} to {price_pred.shape}")
                    price_target = price_target.view(price_pred.shape)
                price_loss = self.price_criterion(price_pred, price_target)
                # Start with base losses - avoid inplace operations on computation graph
                total_loss = action_loss + 0.1 * price_loss  # Weight auxiliary task
                # CRITICAL FIX: Scale loss for gradient accumulation
                # This prevents gradient explosion when accumulating over multiple batches
                if accumulate_gradients:
                    # Divisor is hard-coded to 5 accumulation steps; it must match the training loop's accumulation count
                    total_loss = total_loss / 5.0
            # Add confidence loss if available
            if 'confidence' in outputs and 'trade_success' in batch:
                # Both tensors should have shape [batch_size, 1] for BCELoss
                confidence_pred = outputs['confidence']
                trade_target = batch['trade_success'].float()
-                # Ensure both are 2D tensors [batch_size, 1]
+                # FIXED: Ensure both are 2D tensors [batch_size, 1]
                # Handle different input shapes robustly
                if confidence_pred.dim() == 0:
                    # Scalar -> [1, 1]
@@ -1208,6 +1226,9 @@ class TradingTransformerTrainer:
                elif confidence_pred.dim() == 1:
                    # [batch_size] -> [batch_size, 1]
                    confidence_pred = confidence_pred.unsqueeze(-1)
                elif confidence_pred.dim() == 3:
                    # [batch_size, seq_len, 1] -> [batch_size, 1] (take last timestep)
                    confidence_pred = confidence_pred[:, -1, :]
                if trade_target.dim() == 0:
                    # Scalar -> [1, 1]
@@ -1292,11 +1313,31 @@ class TradingTransformerTrainer:
                'learning_rate': self.scheduler.get_last_lr()[0]
            }
-            # Delete large tensors to free memory immediately
+            # CRITICAL: Delete large tensors to free memory immediately
            # This prevents memory accumulation across batches
            del outputs, total_loss, action_loss, price_loss, predictions, accuracy
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            return result
        except torch.cuda.OutOfMemoryError as oom_error:
            logger.error(f"CUDA OOM in train_step: {oom_error}")
            # Aggressive cleanup on OOM
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
            # Drop partially accumulated gradients (zero_grad clears .grad only, not optimizer state)
            self.optimizer.zero_grad(set_to_none=True)
            # Return zero loss to continue training
            return {
                'total_loss': 0.0,
                'action_loss': 0.0,
                'price_loss': 0.0,
                'accuracy': 0.0,
                'candle_accuracy': 0.0,
                'learning_rate': self.scheduler.get_last_lr()[0] if hasattr(self, 'scheduler') else 0.0
            }
        except Exception as e:
            logger.error(f"Error in train_step: {e}", exc_info=True)
            # Clear any partial computations
@@ -1309,6 +1350,7 @@ class TradingTransformerTrainer:
                'action_loss': 0.0,
                'price_loss': 0.0,
                'accuracy': 0.0,
                'candle_accuracy': 0.0,
                'learning_rate': self.scheduler.get_last_lr()[0] if hasattr(self, 'scheduler') else 0.0
            }