diff --git a/ANNOTATE/core/real_training_adapter.py b/ANNOTATE/core/real_training_adapter.py
index af5dd95..8b23ee4 100644
--- a/ANNOTATE/core/real_training_adapter.py
+++ b/ANNOTATE/core/real_training_adapter.py
@@ -2530,11 +2530,14 @@ class RealTrainingAdapter:
         OPTIMIZATION: Batches are already on GPU and grouped for efficient processing.
         Each mini-batch contains 5 samples for better GPU utilization.
 
-        IMPORTANT: Yields the same batch objects across epochs (no copying).
-        The train_step method should not modify batch contents in-place.
+        IMPORTANT: Creates a shallow copy of batch dict to prevent in-place modifications
+        from affecting subsequent epochs. Tensors themselves are shared (not copied).
         """
         for batch in grouped_batches:
-            yield batch
+            # Create shallow copy of batch dict to prevent modifications
+            # Tensors are shared (not cloned) for memory efficiency
+            batch_copy = {k: v for k, v in batch.items()}
+            yield batch_copy
 
         total_batches = len(grouped_batches)
 
diff --git a/NN/models/advanced_transformer_trading.py b/NN/models/advanced_transformer_trading.py
index 9571eb0..770dba0 100644
--- a/NN/models/advanced_transformer_trading.py
+++ b/NN/models/advanced_transformer_trading.py
@@ -1355,27 +1355,29 @@ class TradingTransformerTrainer:
                     needs_transfer = (v.device != self.device)
                     break
 
+            # Always create a new batch_on_device dict to avoid modifying the input batch
+            # This is critical for multi-epoch training where batches are reused
+            batch_on_device = {}
+
             if needs_transfer:
-                # Move batch to device - iterate over copy of keys to avoid modification during iteration
-                batch_gpu = {}
-                for k in list(batch.keys()):  # Create list copy to avoid modification during iteration
+                # Move batch to device - create new tensors
+                for k in list(batch.keys()):
                     v = batch[k]
                     if isinstance(v, torch.Tensor):
                         # Move to device (creates GPU copy)
-                        batch_gpu[k] = v.to(self.device, non_blocking=True)
+                        batch_on_device[k] = v.to(self.device, non_blocking=True)
                     else:
-                        batch_gpu[k] = v
-
-                # Replace batch with GPU version
-                batch = batch_gpu
-                del batch_gpu
-            # else: batch is already on GPU, use it directly!
+                        batch_on_device[k] = v
+            else:
+                # Batch is already on GPU, but still create a copy of the dict
+                # to avoid modifying the original batch dict
+                for k, v in batch.items():
+                    batch_on_device[k] = v
 
             # Ensure all batch tensors are on the same device as the model
             # This is critical to avoid device mismatch errors
             model_device = next(self.model.parameters()).device
-            batch_on_device = {}
-            for k, v in batch.items():
+            for k, v in list(batch_on_device.items()):
                 if isinstance(v, torch.Tensor):
                     # Move tensor to model's device if it's not already there
                     if v.device != model_device:
diff --git a/QUICK_ACTION_SUMMARY.md b/QUICK_ACTION_SUMMARY.md
new file mode 100644
index 0000000..bd6f91b
--- /dev/null
+++ b/QUICK_ACTION_SUMMARY.md
@@ -0,0 +1,52 @@
+# Quick Action Summary - Training Effectiveness
+
+## What Was Wrong
+
+**Only epoch 1 was training; epochs 2-10 were skipped with 0.0 loss**
+
+The batch dictionaries were being modified in-place during training, so by epoch 2 the data was corrupted.
+
+## What Was Fixed
+
+### 1. Batch Generator (ANNOTATE/core/real_training_adapter.py)
+```python
+# ❌ BEFORE - Same batch object reused
+for batch in grouped_batches:
+    yield batch
+
+# ✅ AFTER - New dict each time
+for batch in grouped_batches:
+    batch_copy = {k: v for k, v in batch.items()}
+    yield batch_copy
+```
+
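+For illustration only (this snippet is not project code; the dict keys and values are
+made up), the sketch below shows why the copy matters. When the same dict object is
+yielded every epoch, a key popped in epoch 1 is still missing in epoch 2, which is the
+"No 'actions' key in batch" symptom from the logs. A shallow copy keeps each step's
+mutations local while still sharing the underlying values.
+
+```python
+def make_batches():
+    return [{"actions": [1, 0, 1], "prices": [101.2, 101.5]}]
+
+def consume(batch):
+    # Stand-in for a train_step that mutates its input dict
+    batch.pop("actions")
+
+# BEFORE: the same dict object is yielded every epoch
+batches = make_batches()
+for epoch in range(2):
+    for batch in batches:
+        print(f"epoch {epoch + 1}: actions present = {'actions' in batch}")
+        if "actions" in batch:
+            consume(batch)   # epoch 1 removes the key from the shared dict
+# -> True, then False: epoch 2 has nothing to train on
+
+# AFTER: a shallow copy is yielded, so mutations stay inside one step
+batches = make_batches()
+for epoch in range(2):
+    for batch in batches:
+        batch_copy = {k: v for k, v in batch.items()}
+        print(f"epoch {epoch + 1}: actions present = {'actions' in batch_copy}")
+        consume(batch_copy)  # only the copy loses the key
+# -> True in both epochs
+```
+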
+### 2. Train Step (NN/models/advanced_transformer_trading.py)
+```python
+# ❌ BEFORE - Modifies input batch
+batch = batch_gpu  # Overwrites input
+
+# ✅ AFTER - Creates new dict
+batch_on_device = {}  # New dict, preserves input
+for k, v in batch.items():
+    batch_on_device[k] = v
+```
+
+## Expected Result
+
+- ✅ All 10 epochs should now train with real loss values
+- ✅ No more "No timeframe data" warnings after epoch 1
+- ✅ Loss should decrease across epochs
+- ✅ Model should actually learn
+
+## Still Need to Address
+
+1. **GPU utilization 0%** - May be a monitoring issue or an artifact of single-sample batches
+2. **Occasional inplace errors** - Caught and recovered from, but each one costs a training step
+3. **Single-sample batches** - Need to accumulate more samples for effective training
+
+## Test It
+
+Run your realtime training again and check that:
+- Epoch 2 shows non-zero loss (not 0.000000)
+- All epochs train successfully
+- Loss decreases over time
diff --git a/TRAINING_EFFECTIVENESS_FIXES.md b/TRAINING_EFFECTIVENESS_FIXES.md
new file mode 100644
index 0000000..5984f04
--- /dev/null
+++ b/TRAINING_EFFECTIVENESS_FIXES.md
@@ -0,0 +1,133 @@
+# Training Effectiveness Fixes
+
+## Issues Identified
+
+From the logs, we found several critical issues preventing effective training:
+
+### 1. **Batch Corruption Across Epochs** ❌
+**Problem**: Only epoch 1 trains successfully; epochs 2-10 all show 0.0 loss
+```
+Epoch 1/10, Loss: 1.688709, Accuracy: 0.00% (1 batches) ✅ Training works
+Epoch 2/10, Loss: 0.000000, Accuracy: 0.00% (1 batches) ❌ No training
+Epoch 3/10, Loss: 0.000000, Accuracy: 0.00% (1 batches) ❌ No training
+...
+WARNING - No timeframe data available for transformer forward pass
+WARNING - No 'actions' key in batch - skipping this training step
+```
+
+**Root Cause**:
+- Batches were being reused across epochs without copying
+- `train_step()` was modifying the batch dict in-place
+- By epoch 2, the batch tensors were corrupted or missing
+
+**Fixes Applied**:
+1. **Batch Generator**: Create a shallow copy of the batch dict for each yield
+   ```python
+   # Before: yield batch (same object reused)
+   # After: yield {k: v for k, v in batch.items()} (new dict each time)
+   ```
+
+2. **Train Step**: Always create a new `batch_on_device` dict instead of modifying the input
+   ```python
+   # Before: batch = batch_gpu (modifies input)
+   # After: batch_on_device = {...} (new dict, preserves input)
+   ```
+
+### 2. **Remaining Inplace Errors** ⚠️
+**Problem**: Occasional inplace operation errors still appear (training recovers)
+```
+ERROR - Inplace operation error: [torch.FloatTensor [128, 3]] version 4; expected version 2
+ERROR - Inplace operation error: [torch.FloatTensor [256, 256]] version 6; expected version 4
+```
+
+**Root Cause**:
+- The `trend_target` tensor `[128, 3]` suggests batching is creating shared tensors
+- Weight matrices `[256, 256]` are being modified during the backward pass
+
+**Current Status**:
+- Errors are caught and training continues (returns 0.0 loss for that step)
+- Training does not crash, but those steps are wasted
+
+**Potential Additional Fixes** (if issues persist):
+1. Ensure `trend_target` is detached after creation
+2. Add `.detach()` to intermediate tensors before loss calculation (see the sketch below)
+3. Use `torch.no_grad()` for any non-training operations
+
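+As a rough sketch of fixes 1-2 above (illustrative only: the tensor names and shapes are
+assumptions taken from the logged `[128, 3]` error, not the project's actual loss code),
+detaching and cloning the target gives the loss a private, frozen copy, so later
+in-place changes to the source tensors cannot trip autograd's version counter:
+
+```python
+import torch
+import torch.nn as nn
+
+logits = torch.randn(128, 3, requires_grad=True)  # stand-in for a model output
+raw_trend = torch.randn(128, 3)                   # stand-in for values built from batch tensors
+
+# Freeze a private copy of the target before it is used in the loss
+trend_target = raw_trend.detach().clone()
+
+loss = nn.functional.cross_entropy(logits, trend_target.argmax(dim=1))
+loss.backward()
+print(f"loss={loss.item():.4f}, grad shape={tuple(logits.grad.shape)}")
+```
+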
+### 3. **Zero GPU Utilization** 🔧
+**Problem**: GPU shows 0.0% utilization and 0.00GB memory
+```
+GPU: AMD Radeon 8060S, Util: 0.0%, Mem: 0.00GB/46.97GB
+```
+
+**Possible Causes**:
+1. **ROCm/AMD GPU monitoring issue**: The monitoring tool might not support AMD GPUs properly
+2. **Computation too fast**: Single-sample batches complete before the monitor can capture any utilization
+3. **CPU fallback**: The model might be running on the CPU despite a GPU being available
+
+**Recommendations**:
+1. Check whether the model is actually on the GPU: `next(model.parameters()).device`
+2. Increase batch size for longer GPU operations
+3. Use AMD-specific monitoring tools (`rocm-smi`) instead of nvidia-smi-based tools
+
+### 4. **Single Sample Batches** 📊
+**Problem**: Training with only 1 sample per batch
+```
+Total samples: 1
+Ready to train on 1 batches
+```
+
+**Impact**:
+- Poor GPU utilization (GPUs are optimized for parallel processing)
+- Noisy gradients (no batch averaging)
+- Slower training convergence
+
+**Recommendations**:
+1. Accumulate more training samples before starting training
+2. Use gradient accumulation to simulate larger batches (see the sketch at the end of this document)
+3. Collect multiple pivot points before triggering training
+
+## Files Modified
+
+1. **ANNOTATE/core/real_training_adapter.py**
+   - Lines 2527-2538: Batch generator now creates shallow copies
+
+2. **NN/models/advanced_transformer_trading.py**
+   - Lines 1350-1390: `train_step` now builds a new `batch_on_device` dict
+
+## Expected Improvements
+
+After these fixes:
+
+✅ **All epochs should train**: Epochs 2-10 will have real loss values, not 0.0
+✅ **Consistent training**: No more "No timeframe data" warnings after epoch 1
+✅ **Better convergence**: Loss should decrease across epochs
+✅ **Fewer inplace errors**: Batch corruption was causing many of them
+
+## Testing Checklist
+
+Run realtime training and verify:
+
+- [ ] Epoch 1 trains successfully (already working)
+- [ ] Epoch 2 shows non-zero loss (should be fixed now)
+- [ ] Epochs 3-10 all train with real loss values
+- [ ] No "No timeframe data" warnings after epoch 1
+- [ ] Loss decreases over epochs (model is learning)
+- [ ] Accuracy increases over epochs
+- [ ] Fewer inplace operation errors
+
+## Additional Recommendations
+
+### Short Term:
+1. **Increase training samples**: Collect 10-20 pivot points before training
+2. **Batch size**: Group samples into batches of 8-16 for better GPU utilization
+3. **Learning rate**: May need adjustment if training converges too slowly or becomes unstable
+
+### Medium Term:
+1. **Data augmentation**: Generate more training samples from each pivot
+2. **Validation set**: Split data to monitor overfitting
+3. **Early stopping**: Stop training when validation loss stops improving
+
+### Long Term:
+1. **Distributed training**: Use multiple GPUs if available
+2. **Mixed precision**: Already enabled, but verify it's working
+3. **Model pruning**: Remove unused parameters to speed up training
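+
+## Appendix: Gradient Accumulation Sketch
+
+As referenced in the single-sample-batch recommendations above, gradient accumulation
+lets several single-sample steps contribute to one optimizer update. This is a minimal
+sketch only; the toy model, shapes, and step counts are placeholders, not the project's
+actual trainer:
+
+```python
+import torch
+import torch.nn as nn
+
+model = nn.Linear(10, 3)  # toy stand-in for the trading transformer
+optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+accumulation_steps = 8    # treat 8 single-sample steps as one effective batch
+
+optimizer.zero_grad()
+for step in range(32):
+    x = torch.randn(1, 10)                  # single-sample batch
+    target = torch.randint(0, 3, (1,))
+    loss = nn.functional.cross_entropy(model(x), target)
+    (loss / accumulation_steps).backward()  # scale so the update averages over the window
+    if (step + 1) % accumulation_steps == 0:
+        optimizer.step()                    # one update per 8 accumulated samples
+        optimizer.zero_grad()
+```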