diff --git a/ANNOTATE/core/real_training_adapter.py b/ANNOTATE/core/real_training_adapter.py
index caac6b8..b2f81c6 100644
--- a/ANNOTATE/core/real_training_adapter.py
+++ b/ANNOTATE/core/real_training_adapter.py
@@ -337,8 +337,8 @@ class RealTrainingAdapter:
         # Get training config
         training_config = test_case.get('training_config', {})
         timeframes = training_config.get('timeframes', ['1s', '1m', '1h', '1d'])
-        # Reduce sequence length to avoid OOM - 200 candles is more reasonable
-        # With 5 timeframes, this gives 1000 total positions vs 3000 with 600 candles
+        # RESTORED: 200 candles per timeframe (memory leak fixed)
+        # With 5 timeframes * 200 candles = 1000 total positions
         candles_per_timeframe = training_config.get('candles_per_timeframe', 200)  # 200 candles per batch

         # Determine secondary symbol based on primary symbol
@@ -586,20 +586,24 @@ class RealTrainingAdapter:
                 logger.info(f"  Test case {i+1}: Added {len(hold_samples)} HOLD samples (during position)")

             # Create EXIT sample (where model SHOULD exit trade)
-            exit_timestamp = test_case.get('annotation_metadata', {}).get('exit_timestamp')
-            if exit_timestamp:
+            # Exit info is in expected_outcome, not annotation_metadata
+            exit_price = expected_outcome.get('exit_price')
+            if exit_price:
+                # For now, use same market state (TODO: fetch market state at exit time)
+                # The model will learn to exit based on profit_loss_pct and position state
                 exit_sample = {
-                    'market_state': market_state,  # TODO: Get market state at exit time
+                    'market_state': market_state,  # Using entry market state as proxy
                     'action': 'CLOSE',
                     'direction': expected_outcome.get('direction'),
                     'profit_loss_pct': expected_outcome.get('profit_loss_pct'),
                     'entry_price': expected_outcome.get('entry_price'),
-                    'exit_price': expected_outcome.get('exit_price'),
-                    'timestamp': exit_timestamp,
-                    'label': 'EXIT'  # Exit signal
+                    'exit_price': exit_price,
+                    'timestamp': test_case.get('timestamp'),  # Entry timestamp (exit time not stored separately)
+                    'label': 'EXIT',  # Exit signal
+                    'in_position': True  # Model is in position when deciding to exit
                 }
                 training_data.append(exit_sample)
-                logger.info(f"  Test case {i+1}: EXIT sample @ {exit_sample['exit_price']} ({exit_sample['profit_loss_pct']:.2f}%)")
+                logger.info(f"  Test case {i+1}: EXIT sample @ {exit_price} ({expected_outcome.get('profit_loss_pct', 0):.2f}%)")

             # Create NEGATIVE samples (where model should NOT trade)
             # These are candles before and after the signal (±15 candles)
@@ -1186,12 +1190,13 @@ class RealTrainingAdapter:
         timeframes = market_state.get('timeframes', {})
         secondary_timeframes = market_state.get('secondary_timeframes', {})

-        # Target sequence length - use actual data length (typically 200 candles)
-        # Find the first available timeframe to determine sequence length
-        target_seq_len = 200  # Default
+        # Target sequence length - RESTORED to 200 (memory leak fixed)
+        # With 5 timeframes * 200 candles = 1000 sequence positions
+        # Memory management fixes allow full sequence length
+        target_seq_len = 200  # Restored to original
        for tf_data in timeframes.values():
             if tf_data and 'close' in tf_data and len(tf_data['close']) > 0:
-                target_seq_len = min(len(tf_data['close']), 200)  # Cap at 200 to avoid OOM
+                target_seq_len = min(len(tf_data['close']), 200)  # Cap at 200
                 break

         # Extract each timeframe (returns None if not available)
@@ -1409,12 +1414,14 @@
             # For HOLD samples, expect no price change
             future_price_ratio = 0.0

-        future_prices = torch.tensor([future_price_ratio], dtype=torch.float32)
+        # FIXED: Shape must be [batch, 1] to match price_head output
+        future_prices = torch.tensor([[future_price_ratio]], dtype=torch.float32)  # [1, 1]

         # Trade success (1.0 if profitable, 0.0 otherwise)
-        # Shape must be [batch_size, 1] to match confidence head output
+        # Shape must be [batch_size, 1] to match confidence head output [batch, 1]
         profit_loss_pct = training_sample.get('profit_loss_pct', 0.0)
-        trade_success = torch.tensor([[1.0 if profit_loss_pct > 0 else 0.0]], dtype=torch.float32)
+        # FIXED: Ensure shape is [1, 1] not [1] to match BCELoss requirements
+        trade_success = torch.tensor([[1.0 if profit_loss_pct > 0 else 0.0]], dtype=torch.float32)  # [1, 1]

         # Return batch dictionary with ALL timeframes
         batch = {
@@ -1580,8 +1587,8 @@ class RealTrainingAdapter:
         logger.info(f"  Converted {len(training_data)} samples to {len(converted_batches)} training batches")

         # Use batch size of 1 to avoid OOM with large sequence lengths
-        # With 5 timeframes * 600 candles = 3000 sequence positions per sample,
-        # even batch_size=5 causes 15,000 positions which is too large for GPU
+        # With 5 timeframes * 200 candles = 1000 sequence positions per sample
+        # Batch size of 1 ensures we don't exceed GPU memory (8GB)
         mini_batch_size = 1  # Process one sample at a time to avoid OOM

         def _combine_batches(batch_list: List[Dict[str, 'torch.Tensor']]) -> Dict[str, 'torch.Tensor']:
@@ -1623,6 +1630,10 @@
             epoch_accuracy = 0.0
             num_batches = 0

+            # Clear CUDA cache before epoch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
             for i, batch in enumerate(grouped_batches):
                 try:
                     # Determine if this is an accumulation step or optimizer step
@@ -1639,16 +1650,41 @@
                         epoch_accuracy += batch_accuracy
                         num_batches += 1

-                        # Log first batch and every 10th batch for debugging
-                        if (i + 1) == 1 or (i + 1) % 10 == 0:
+                        # Log first batch and every 5th batch for debugging
+                        if (i + 1) == 1 or (i + 1) % 5 == 0:
                             logger.info(f"  Batch {i + 1}/{len(grouped_batches)}, Loss: {batch_loss:.6f}, Action Acc: {batch_accuracy:.2%}, Candle Acc: {batch_candle_accuracy:.2%}")
                     else:
                         logger.warning(f"  Batch {i + 1} returned None result - skipping")

-                    # Clear CUDA cache after optimizer step (not accumulation step)
-                    if torch.cuda.is_available() and not is_accumulation_step:
+                    # CRITICAL FIX: Delete batch tensors immediately to free GPU memory
+                    # This prevents memory accumulation during gradient accumulation
+                    for key in list(batch.keys()):
+                        if isinstance(batch[key], torch.Tensor):
+                            del batch[key]
+                    del batch
+
+                    # CRITICAL: Clear CUDA cache after EVERY batch to prevent memory accumulation
+                    # This is essential with large models and limited GPU memory
+                    if torch.cuda.is_available():
                         torch.cuda.empty_cache()
+
+                    # After optimizer step (not accumulation), force garbage collection
+                    if not is_accumulation_step:
+                        import gc
+                        gc.collect()
+                        if torch.cuda.is_available():
+                            torch.cuda.synchronize()
+                except torch.cuda.OutOfMemoryError as oom_error:
+                    logger.error(f"  CUDA OOM in batch {i + 1}: {oom_error}")
+                    # Aggressive memory cleanup on OOM
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+                        torch.cuda.synchronize()
+                    # Reset optimizer state to prevent corruption
+                    trainer.optimizer.zero_grad(set_to_none=True)
+                    logger.warning(f"  Skipping batch {i + 1} due to OOM, optimizer state reset")
+                    continue

                except Exception as e:
                     logger.error(f"  Error in batch {i + 1}: {e}")
                     import traceback
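
Aside: the cleanup pattern added to the epoch loop above (drop tensor references, clear the CUDA cache every batch, collect garbage on real optimizer steps, recover from OOM) is hard to read interleaved with diff markers. Below is a minimal standalone sketch of the same pattern, not code from this patch; run_epoch, trainer, grouped_batches, and ACCUM_STEPS are illustrative names.

import gc
import torch

ACCUM_STEPS = 5  # assumed accumulation interval; must match the loss divisor in train_step

def run_epoch(trainer, grouped_batches):
    """Sketch: gradient-accumulation epoch loop with aggressive memory hygiene."""
    for i, batch in enumerate(grouped_batches):
        is_accumulation_step = (i + 1) % ACCUM_STEPS != 0
        try:
            trainer.train_step(batch, accumulate_gradients=is_accumulation_step)
        except torch.cuda.OutOfMemoryError:
            # Skip the offending batch; reset gradients so a half-finished
            # backward pass cannot corrupt the next optimizer step
            trainer.optimizer.zero_grad(set_to_none=True)
        finally:
            # Drop Python references first -- empty_cache() can only release
            # blocks that no live tensor still points at
            for key in list(batch.keys()):
                if isinstance(batch[key], torch.Tensor):
                    del batch[key]
            del batch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            if not is_accumulation_step:
                gc.collect()  # reclaim cyclic references after each real step

The ordering matters: torch.cuda.empty_cache() returns only cached blocks with no live references, so deleting the batch entries has to happen before the cache is cleared.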
diff --git a/NN/models/advanced_transformer_trading.py b/NN/models/advanced_transformer_trading.py
index 90d2efe..6eadb63 100644
--- a/NN/models/advanced_transformer_trading.py
+++ b/NN/models/advanced_transformer_trading.py
@@ -23,19 +23,19 @@ logger = logging.getLogger(__name__)

 @dataclass
 class TradingTransformerConfig:
-    """Configuration for trading transformer models - SCALED TO 46M PARAMETERS"""
-    # Model architecture - SCALED UP
-    d_model: int = 1024  # Model dimension (2x increase)
-    n_heads: int = 16  # Number of attention heads (2x increase)
-    n_layers: int = 12  # Number of transformer layers (2x increase)
-    d_ff: int = 4096  # Feed-forward dimension (2x increase)
+    """Configuration for trading transformer models - WITH PROPER MEMORY MANAGEMENT"""
+    # Model architecture - RESTORED to original size (memory leak fixed)
+    d_model: int = 1024  # Model dimension
+    n_heads: int = 16  # Number of attention heads
+    n_layers: int = 12  # Number of transformer layers
+    d_ff: int = 4096  # Feed-forward dimension
     dropout: float = 0.1  # Dropout rate

-    # Input dimensions - ENHANCED
-    seq_len: int = 150  # Sequence length for time series (1.5x increase)
-    cob_features: int = 100  # COB feature dimension (2x increase)
-    tech_features: int = 40  # Technical indicator features (2x increase)
-    market_features: int = 30  # Market microstructure features (2x increase)
+    # Input dimensions - RESTORED
+    seq_len: int = 200  # Sequence length for time series
+    cob_features: int = 100  # COB feature dimension
+    tech_features: int = 40  # Technical indicator features
+    market_features: int = 30  # Market microstructure features

     # Output configuration
     n_actions: int = 3  # BUY, SELL, HOLD
@@ -358,6 +358,7 @@ class AdvancedTradingTransformer(nn.Module):

         # SERIAL: Shared pattern encoder (learns candle patterns ONCE for all timeframes)
         # This is applied to each timeframe independently but uses SAME weights
+        # RESTORED: Original dimensions (memory leak fixed)
         self.shared_pattern_encoder = nn.Sequential(
             nn.Linear(5, config.d_model // 4),  # 5 OHLCV -> 256
             nn.LayerNorm(config.d_model // 4),
@@ -1163,12 +1164,13 @@
         self.model.train()

         # Only zero gradients if not accumulating
+        # Use set_to_none=True for better memory efficiency
         if not accumulate_gradients:
-            self.optimizer.zero_grad()
+            self.optimizer.zero_grad(set_to_none=True)

         # Move batch to device WITHOUT cloning to avoid version tracking issues
         # The detach().clone() was causing gradient computation errors
-        batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v
+        batch = {k: v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else v
                 for k, v in batch.items()}

         # Use automatic mixed precision (FP16) for memory efficiency
@@ -1189,10 +1191,26 @@

         # Calculate losses
         action_loss = self.action_criterion(outputs['action_logits'], batch['actions'])
-        price_loss = self.price_criterion(outputs['price_prediction'], batch['future_prices'])
+
+        # FIXED: Ensure shapes match for MSELoss
+        price_pred = outputs['price_prediction']
+        price_target = batch['future_prices']
+
+        # Both should be [batch, 1], but ensure they match
+        if price_pred.shape != price_target.shape:
+            logger.debug(f"Reshaping price target from {price_target.shape} to {price_pred.shape}")
+            price_target = price_target.view(price_pred.shape)
+
+        price_loss = self.price_criterion(price_pred, price_target)

         # Start with base losses - avoid inplace operations on computation graph
         total_loss = action_loss + 0.1 * price_loss  # Weight auxiliary task
+
+        # CRITICAL FIX: Scale loss for gradient accumulation
+        # This prevents gradient explosion when accumulating over multiple batches
+        if accumulate_gradients:
+            # Assume accumulation over 5 steps (should match training loop)
+            total_loss = total_loss / 5.0

         # Add confidence loss if available
         if 'confidence' in outputs and 'trade_success' in batch:
@@ -1200,7 +1218,7 @@
             confidence_pred = outputs['confidence']
             trade_target = batch['trade_success'].float()

-            # Ensure both are 2D tensors [batch_size, 1]
+            # FIXED: Ensure both are 2D tensors [batch_size, 1]
             # Handle different input shapes robustly
             if confidence_pred.dim() == 0:
                 # Scalar -> [1, 1]
@@ -1208,6 +1226,9 @@
             elif confidence_pred.dim() == 1:
                 # [batch_size] -> [batch_size, 1]
                 confidence_pred = confidence_pred.unsqueeze(-1)
+            elif confidence_pred.dim() == 3:
+                # [batch_size, seq_len, 1] -> [batch_size, 1] (take last timestep)
+                confidence_pred = confidence_pred[:, -1, :]

             if trade_target.dim() == 0:
                 # Scalar -> [1, 1]
@@ -1292,11 +1313,31 @@
                 'learning_rate': self.scheduler.get_last_lr()[0]
             }

-            # Delete large tensors to free memory immediately
+            # CRITICAL: Delete large tensors to free memory immediately
+            # This prevents memory accumulation across batches
             del outputs, total_loss, action_loss, price_loss, predictions, accuracy
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

             return result

+        except torch.cuda.OutOfMemoryError as oom_error:
+            logger.error(f"CUDA OOM in train_step: {oom_error}")
+            # Aggressive cleanup on OOM
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
+            # Reset optimizer state to prevent corruption
+            self.optimizer.zero_grad(set_to_none=True)
+            # Return zero loss to continue training
+            return {
+                'total_loss': 0.0,
+                'action_loss': 0.0,
+                'price_loss': 0.0,
+                'accuracy': 0.0,
+                'candle_accuracy': 0.0,
+                'learning_rate': self.scheduler.get_last_lr()[0] if hasattr(self, 'scheduler') else 0.0
+            }
         except Exception as e:
             logger.error(f"Error in train_step: {e}", exc_info=True)
             # Clear any partial computations
@@ -1309,6 +1350,7 @@
                 'total_loss': 0.0,
                 'action_loss': 0.0,
                 'price_loss': 0.0,
                 'accuracy': 0.0,
+                'candle_accuracy': 0.0,
                 'learning_rate': self.scheduler.get_last_lr()[0] if hasattr(self, 'scheduler') else 0.0
             }
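
A note on the total_loss / 5.0 scaling introduced in train_step: dividing each micro-batch loss by the accumulation count makes the gradients summed across successive backward() calls equal the gradient of the mean loss over the virtual batch, which is what prevents the blow-up the comment describes. A minimal sketch of the idea, with illustrative names (accumulate_and_step, micro_batches) rather than the repository's API:

import torch

ACCUM_STEPS = 5  # must equal the hardcoded divisor in train_step above

def accumulate_and_step(model, optimizer, criterion, micro_batches):
    """N scaled backward passes followed by a single optimizer step."""
    optimizer.zero_grad(set_to_none=True)
    for inputs, targets in micro_batches:  # len(micro_batches) == ACCUM_STEPS
        loss = criterion(model(inputs), targets)
        # Scaling each micro-batch loss by 1/ACCUM_STEPS makes the summed
        # gradients match the mean over the virtual batch; without it they
        # would be ACCUM_STEPS times too large
        (loss / ACCUM_STEPS).backward()
    optimizer.step()

As the in-code comment ("should match training loop") hints, hardcoding the divisor in two places is fragile; passing the accumulation count into train_step as a parameter would keep the loop and the loss scaling in sync.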