From 76e3bb6a6129e53b87db44a2fba7d112f31059a2 Mon Sep 17 00:00:00 2001 From: Dobromir Popov Date: Thu, 6 Nov 2025 15:54:26 +0200 Subject: [PATCH] fix T training memory usage (due for more improvement) --- ANNOTATE/core/real_training_adapter.py | 44 +++++--- NN/models/advanced_transformer_trading.py | 122 ++++++++++++++-------- core/orchestrator.py | 20 +--- 3 files changed, 114 insertions(+), 72 deletions(-) diff --git a/ANNOTATE/core/real_training_adapter.py b/ANNOTATE/core/real_training_adapter.py index 8ad35a9..59dcc64 100644 --- a/ANNOTATE/core/real_training_adapter.py +++ b/ANNOTATE/core/real_training_adapter.py @@ -336,7 +336,9 @@ class RealTrainingAdapter: # Get training config training_config = test_case.get('training_config', {}) timeframes = training_config.get('timeframes', ['1s', '1m', '1h', '1d']) - candles_per_timeframe = training_config.get('candles_per_timeframe', 600) # 600 candles per batch + # Reduce sequence length to avoid OOM - 200 candles is more reasonable + # With 5 timeframes, this gives 1000 total positions vs 3000 with 600 candles + candles_per_timeframe = training_config.get('candles_per_timeframe', 200) # 200 candles per batch # Determine secondary symbol based on primary symbol # ETH/SOL -> BTC, BTC -> ETH @@ -1183,8 +1185,13 @@ class RealTrainingAdapter: timeframes = market_state.get('timeframes', {}) secondary_timeframes = market_state.get('secondary_timeframes', {}) - # Target sequence length for all timeframes - target_seq_len = 600 + # Target sequence length - use actual data length (typically 200 candles) + # Find the first available timeframe to determine sequence length + target_seq_len = 200 # Default + for tf_data in timeframes.values(): + if tf_data and 'close' in tf_data and len(tf_data['close']) > 0: + target_seq_len = min(len(tf_data['close']), 200) # Cap at 200 to avoid OOM + break # Extract each timeframe (returns None if not available) price_data_1s = self._extract_timeframe_data(timeframes.get('1s', {}), target_seq_len) if '1s' in timeframes else None @@ -1219,8 +1226,8 @@ class RealTrainingAdapter: return None # Create placeholder COB data (zeros if not available) - # COB data shape: [1, 600, 100] to match new sequence length - cob_data = torch.zeros(1, 600, 100, dtype=torch.float32) + # COB data shape: [1, target_seq_len, 100] to match sequence length + cob_data = torch.zeros(1, target_seq_len, 100, dtype=torch.float32) # Create technical indicators from reference timeframe tech_features = [] @@ -1487,9 +1494,10 @@ class RealTrainingAdapter: logger.info(f" Converted {len(training_data)} samples to {len(converted_batches)} training batches") - # Group single-sample batches into mini-batches for efficient training - # Small batch size (5) for better gradient updates with limited training data - mini_batch_size = 5 # Small batches work better with ~255 samples + # Use batch size of 1 to avoid OOM with large sequence lengths + # With 5 timeframes * 600 candles = 3000 sequence positions per sample, + # even batch_size=5 causes 15,000 positions which is too large for GPU + mini_batch_size = 1 # Process one sample at a time to avoid OOM def _combine_batches(batch_list: List[Dict[str, 'torch.Tensor']]) -> Dict[str, 'torch.Tensor']: combined: Dict[str, 'torch.Tensor'] = {} @@ -1521,7 +1529,10 @@ class RealTrainingAdapter: logger.info(f" Grouped into {len(grouped_batches)} mini-batches (target size {mini_batch_size})") - # Train using train_step for each mini-batch + # Train using train_step for each mini-batch with gradient accumulation + # 
Accumulate gradients over multiple batches to simulate larger batch size + accumulation_steps = 5 # Accumulate 5 batches before optimizer step + for epoch in range(session.total_epochs): epoch_loss = 0.0 epoch_accuracy = 0.0 @@ -1529,8 +1540,11 @@ class RealTrainingAdapter: for i, batch in enumerate(grouped_batches): try: + # Determine if this is an accumulation step or optimizer step + is_accumulation_step = (i + 1) % accumulation_steps != 0 + # Call the trainer's train_step method with proper batch format - result = trainer.train_step(batch) + result = trainer.train_step(batch, accumulate_gradients=is_accumulation_step) if result is not None: batch_loss = result.get('total_loss', 0.0) @@ -1539,14 +1553,14 @@ class RealTrainingAdapter: epoch_accuracy += batch_accuracy num_batches += 1 - # Log first batch and every 100th batch for debugging - if (i + 1) == 1 or (i + 1) % 100 == 0: - logger.info(f" Batch {i + 1}/{len(converted_batches)}, Loss: {batch_loss:.6f}, Accuracy: {batch_accuracy:.4f}") + # Log first batch and every 10th batch for debugging + if (i + 1) == 1 or (i + 1) % 10 == 0: + logger.info(f" Batch {i + 1}/{len(grouped_batches)}, Loss: {batch_loss:.6f}, Accuracy: {batch_accuracy:.4f}") else: logger.warning(f" Batch {i + 1} returned None result - skipping") - # Clear CUDA cache periodically to prevent memory leak - if torch.cuda.is_available() and (i + 1) % 5 == 0: + # Clear CUDA cache after optimizer step (not accumulation step) + if torch.cuda.is_available() and not is_accumulation_step: torch.cuda.empty_cache() except Exception as e: diff --git a/NN/models/advanced_transformer_trading.py b/NN/models/advanced_transformer_trading.py index b0a7c7c..f629ef9 100644 --- a/NN/models/advanced_transformer_trading.py +++ b/NN/models/advanced_transformer_trading.py @@ -57,6 +57,9 @@ class TradingTransformerConfig: use_deep_attention: bool = True # Deeper attention mechanisms use_residual_connections: bool = True # Enhanced residual connections use_layer_norm_variants: bool = True # Advanced normalization + + # Memory optimization + use_gradient_checkpointing: bool = True # Trade compute for memory (saves ~30% memory) class PositionalEncoding(nn.Module): """Sinusoidal positional encoding for transformer""" @@ -638,17 +641,17 @@ class AdvancedTradingTransformer(nn.Module): stacked_tfs = torch.stack(timeframe_encodings, dim=1) # [batch, num_tfs, seq_len, d_model] num_tfs = len(timeframe_encodings) - # Reshape for cross-timeframe attention - # [batch, num_tfs, seq_len, d_model] -> [batch, num_tfs * seq_len, d_model] - cross_tf_input = stacked_tfs.reshape(batch_size, num_tfs * seq_len, self.config.d_model) + # MEMORY EFFICIENT: Process timeframes with shared weights + # Reshape to process all timeframes in parallel: [batch*num_tfs, seq_len, d_model] + # This avoids creating huge concatenated sequences while still processing efficiently + batched_tfs = stacked_tfs.reshape(batch_size * num_tfs, seq_len, self.config.d_model) - # Apply cross-timeframe attention layers - # This allows the model to see patterns ACROSS timeframes simultaneously + # Apply attention layers (shared across timeframes) for layer in self.cross_timeframe_layers: - cross_tf_input = layer(cross_tf_input) + batched_tfs = layer(batched_tfs) - # Reshape back: [batch, num_tfs * seq_len, d_model] -> [batch, num_tfs, seq_len, d_model] - cross_tf_output = cross_tf_input.reshape(batch_size, num_tfs, seq_len, self.config.d_model) + # Reshape back: [batch*num_tfs, seq_len, d_model] -> [batch, num_tfs, seq_len, d_model] + 
cross_tf_output = batched_tfs.reshape(batch_size, num_tfs, seq_len, self.config.d_model) # Average across timeframes to get unified representation # [batch, num_tfs, seq_len, d_model] -> [batch, seq_len, d_model] @@ -706,10 +709,18 @@ class AdvancedTradingTransformer(nn.Module): else: x = self.pos_encoding(x.transpose(0, 1)).transpose(0, 1) - # Apply transformer layers + # Apply transformer layers with optional gradient checkpointing regime_probs_history = [] for layer in self.layers: - layer_output = layer(x, mask) + if self.training and self.config.use_gradient_checkpointing: + # Use gradient checkpointing to save memory during training + # Trades compute for memory (recomputes activations during backward pass) + layer_output = torch.utils.checkpoint.checkpoint( + layer, x, mask, use_reentrant=False + ) + else: + layer_output = layer(x, mask) + x = layer_output['output'] if layer_output['regime_probs'] is not None: regime_probs_history.append(layer_output['regime_probs']) @@ -1107,6 +1118,11 @@ class TradingTransformerTrainer: # Move model to device self.model.to(self.device) + # Mixed precision training disabled - causes dtype mismatches + # Can be re-enabled if needed, but requires careful dtype management + self.use_amp = False + self.scaler = None + # Optimizer with warmup self.optimizer = optim.AdamW( model.parameters(), @@ -1136,37 +1152,47 @@ class TradingTransformerTrainer: 'learning_rates': [] } - def train_step(self, batch: Dict[str, torch.Tensor]) -> Dict[str, float]: - """Single training step""" + def train_step(self, batch: Dict[str, torch.Tensor], accumulate_gradients: bool = False) -> Dict[str, float]: + """Single training step with optional gradient accumulation + + Args: + batch: Training batch + accumulate_gradients: If True, don't zero gradients or step optimizer (for gradient accumulation) + """ try: self.model.train() - self.optimizer.zero_grad() + + # Only zero gradients if not accumulating + if not accumulate_gradients: + self.optimizer.zero_grad() # Move batch to device WITHOUT cloning to avoid version tracking issues # The detach().clone() was causing gradient computation errors batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()} - # Forward pass with multi-timeframe data - outputs = self.model( - price_data_1s=batch.get('price_data_1s'), - price_data_1m=batch.get('price_data_1m'), - price_data_1h=batch.get('price_data_1h'), - price_data_1d=batch.get('price_data_1d'), - btc_data_1m=batch.get('btc_data_1m'), - cob_data=batch['cob_data'], - tech_data=batch['tech_data'], - market_data=batch['market_data'], - position_state=batch.get('position_state'), - price_data=batch.get('price_data') # Legacy fallback - ) - - # Calculate losses - action_loss = self.action_criterion(outputs['action_logits'], batch['actions']) - price_loss = self.price_criterion(outputs['price_prediction'], batch['future_prices']) - - # Start with base losses - avoid inplace operations on computation graph - total_loss = action_loss + 0.1 * price_loss # Weight auxiliary task + # Use automatic mixed precision (FP16) for memory efficiency + with torch.cuda.amp.autocast(enabled=self.use_amp): + # Forward pass with multi-timeframe data + outputs = self.model( + price_data_1s=batch.get('price_data_1s'), + price_data_1m=batch.get('price_data_1m'), + price_data_1h=batch.get('price_data_1h'), + price_data_1d=batch.get('price_data_1d'), + btc_data_1m=batch.get('btc_data_1m'), + cob_data=batch['cob_data'], + tech_data=batch['tech_data'], + 
market_data=batch['market_data'], + position_state=batch.get('position_state'), + price_data=batch.get('price_data') # Legacy fallback + ) + + # Calculate losses + action_loss = self.action_criterion(outputs['action_logits'], batch['actions']) + price_loss = self.price_criterion(outputs['price_prediction'], batch['future_prices']) + + # Start with base losses - avoid inplace operations on computation graph + total_loss = action_loss + 0.1 * price_loss # Weight auxiliary task # Add confidence loss if available if 'confidence' in outputs and 'trade_success' in batch: @@ -1199,9 +1225,12 @@ class TradingTransformerTrainer: # Use addition instead of += to avoid inplace operation total_loss = total_loss + 0.1 * confidence_loss - # Backward pass + # Backward pass with mixed precision scaling try: - total_loss.backward() + if self.use_amp: + self.scaler.scale(total_loss).backward() + else: + total_loss.backward() except RuntimeError as e: if "inplace operation" in str(e): logger.error(f"Inplace operation error during backward pass: {e}") @@ -1216,12 +1245,23 @@ class TradingTransformerTrainer: else: raise - # Gradient clipping - torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm) - - # Optimizer step - self.optimizer.step() - self.scheduler.step() + # Only clip gradients and step optimizer if not accumulating + if not accumulate_gradients: + if self.use_amp: + # Unscale gradients before clipping + self.scaler.unscale_(self.optimizer) + # Gradient clipping + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm) + # Optimizer step with scaling + self.scaler.step(self.optimizer) + self.scaler.update() + else: + # Gradient clipping + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm) + # Optimizer step + self.optimizer.step() + + self.scheduler.step() # Calculate accuracy without gradients with torch.no_grad(): diff --git a/core/orchestrator.py b/core/orchestrator.py index b9e910e..c11e438 100644 --- a/core/orchestrator.py +++ b/core/orchestrator.py @@ -836,22 +836,10 @@ class TradingOrchestrator: try: from NN.models.dqn_agent import DQNAgent - # Determine actual state size from BaseDataInput - try: - base_data = self.data_provider.build_base_data_input(self.symbol) - if base_data: - actual_state_size = len(base_data.get_feature_vector()) - logger.info(f"Detected actual state size: {actual_state_size}") - else: - actual_state_size = 7850 # Fallback based on error message - logger.warning( - f"Could not determine state size, using fallback: {actual_state_size}" - ) - except Exception as e: - actual_state_size = 7850 # Fallback based on error message - logger.warning( - f"Error determining state size: {e}, using fallback: {actual_state_size}" - ) + # Use known state size instead of building data (which triggers massive API calls) + # The state size is determined by BaseDataInput structure and doesn't change + actual_state_size = 7850 # Known size from BaseDataInput.get_feature_vector() + logger.info(f"Using known state size: {actual_state_size}") action_size = self.config.rl.get("action_space", 3) self.rl_agent = DQNAgent(
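
Note: the memory-saving pieces in this patch (gradient accumulation driven by the new accumulate_gradients flag in train_step, gradient checkpointing inside the transformer layers, and the currently disabled AMP path guarded by use_amp/scaler) are meant to work together in one training loop. The sketch below shows that intended interaction in isolation. It is illustrative only, not code from this repository: TinyModel, train_epoch, the tensor shapes, and accumulation_steps=5 are stand-ins chosen to mirror the patch (batch_size 1, ~200-candle sequences, optimizer step every 5 mini-batches, cache clear after optimizer steps), and it assumes the same PyTorch APIs the patch relies on (torch.utils.checkpoint.checkpoint with use_reentrant=False, torch.cuda.amp autocast/GradScaler).

    import torch
    import torch.nn as nn
    from torch.utils.checkpoint import checkpoint

    # Hypothetical stand-in for the trading transformer: a couple of layers wrapped
    # with gradient checkpointing so activations are recomputed in backward
    # instead of being kept in memory.
    class TinyModel(nn.Module):
        def __init__(self, d_model=64, use_checkpointing=True):
            super().__init__()
            self.layers = nn.ModuleList([
                nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
                for _ in range(2)
            ])
            self.head = nn.Linear(d_model, 3)  # BUY / SELL / HOLD logits
            self.use_checkpointing = use_checkpointing

        def forward(self, x):
            for layer in self.layers:
                if self.training and self.use_checkpointing:
                    # Trade compute for memory, as in the patched forward pass
                    x = checkpoint(layer, x, use_reentrant=False)
                else:
                    x = layer(x)
            return self.head(x[:, -1])  # predict from the last sequence position

    def train_epoch(model, batches, optimizer, accumulation_steps=5, use_amp=False):
        """One epoch with gradient accumulation and an optional AMP path."""
        device = next(model.parameters()).device
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
        criterion = nn.CrossEntropyLoss()
        model.train()
        optimizer.zero_grad()

        for i, (x, y) in enumerate(batches):
            x, y = x.to(device), y.to(device)
            with torch.cuda.amp.autocast(enabled=use_amp):
                loss = criterion(model(x), y)
                # Scale the loss so accumulated gradients average over the window
                loss = loss / accumulation_steps

            if use_amp:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            # Step the optimizer only every `accumulation_steps` mini-batches,
            # mirroring the accumulate_gradients flag added to train_step()
            if (i + 1) % accumulation_steps == 0:
                if use_amp:
                    scaler.unscale_(optimizer)  # unscale before clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                if use_amp:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                optimizer.zero_grad()
                if torch.cuda.is_available():
                    # Cache clear after optimizer steps, not accumulation steps
                    torch.cuda.empty_cache()

    if __name__ == "__main__":
        torch.manual_seed(0)
        model = TinyModel()
        opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
        # 10 fake mini-batches of [batch=1, seq_len=200, d_model=64], roughly the
        # shapes the patch targets after reducing candles_per_timeframe to 200
        data = [(torch.randn(1, 200, 64), torch.randint(0, 3, (1,))) for _ in range(10)]
        train_epoch(model, data, opt, accumulation_steps=5, use_amp=False)

One caveat the sketch shares with the patch: if the number of mini-batches is not a multiple of accumulation_steps, the leftover gradients at the end of the epoch are never applied; a final optimizer step after the loop would close that gap.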