diff --git a/.vscode/launch.json b/.vscode/launch.json
index bb83115..9055907 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -17,7 +17,6 @@
                 "ENABLE_REALTIME_CHARTS": "1",
                 "ENABLE_NN_MODELS": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -39,7 +38,6 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
         {
@@ -61,7 +59,6 @@
                 "PYTHONUNBUFFERED": "1",
                 "CUDA_VISIBLE_DEVICES": "0",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
         {
@@ -84,7 +81,6 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
         {
@@ -97,7 +93,6 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
         {
@@ -112,7 +107,6 @@
                 "FLASK_ENV": "development",
                 "FLASK_DEBUG": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "cwd": "${workspaceFolder}",
             "preLaunchTask": "Kill Stale Processes"
@@ -129,7 +123,6 @@
                 "COB_BTC_BUCKET_SIZE": "10",
                 "COB_ETH_BUCKET_SIZE": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -146,7 +139,6 @@
                 "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:256",
                 "ENABLE_REALTIME_RL": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -165,7 +157,6 @@
                 "COB_BTC_BUCKET_SIZE": "10",
                 "COB_ETH_BUCKET_SIZE": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -179,7 +170,6 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
         {
@@ -192,7 +182,6 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
 
@@ -214,7 +203,6 @@
                 "COBY_WEBSOCKET_PORT": "8081",
                 "COBY_LOG_LEVEL": "DEBUG",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "preLaunchTask": "Kill Stale Processes",
             "presentation": {
diff --git a/NN/models/advanced_transformer_trading.py b/NN/models/advanced_transformer_trading.py
index 2e00038..63dedfe 100644
--- a/NN/models/advanced_transformer_trading.py
+++ b/NN/models/advanced_transformer_trading.py
@@ -1227,6 +1227,13 @@ class TradingTransformerTrainer:
         try:
             self.model.train()
 
+            # Enable anomaly detection temporarily to debug inplace operation issues
+            # NOTE: This significantly slows down training (2-3x slower), use only for debugging
+            # Set to False once the issue is resolved
+            enable_anomaly_detection = False  # Set to True to debug gradient issues
+            if enable_anomaly_detection:
+                torch.autograd.set_detect_anomaly(True)
+
             # GRADIENT ACCUMULATION: Determine if this is an accumulation step
             # If gradient_accumulation_steps is set, use automatic accumulation
             # Otherwise, fall back to manual accumulate_gradients flag
@@ -1445,8 +1452,25 @@
            else:
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm)
-                # Optimizer step
-                self.optimizer.step()
+                # Optimizer step with error handling
+                try:
+                    self.optimizer.step()
+                except (KeyError, RuntimeError) as opt_error:
+                    logger.error(f"Optimizer step failed: {opt_error}. Resetting optimizer state.")
+                    # Zero gradients first to clear any stale gradients
+                    self.optimizer.zero_grad(set_to_none=True)
+                    # Reset optimizer to fix corrupted state
+                    self.optimizer = torch.optim.AdamW(
+                        self.model.parameters(),
+                        lr=self.config.learning_rate,
+                        weight_decay=self.config.weight_decay
+                    )
+                    # Zero gradients again after recreating optimizer
+                    self.optimizer.zero_grad(set_to_none=True)
+                    # Retry optimizer step with fresh state
+                    # Note: We need to recompute loss and backward pass, but for now just skip this step
+                    logger.warning("Skipping optimizer step after reset - gradients need to be recomputed")
+                    # Don't raise - allow training to continue with next batch
 
            self.scheduler.step()
 
@@ -1697,30 +1721,55 @@ class TradingTransformerTrainer:
            logger.warning(f"Error loading model state dict: {e}, continuing with partial load")
 
        # Load optimizer state (handle mismatched states gracefully)
+        # IMPORTANT: Always recreate optimizer if there's any issue to avoid corrupted state
+        optimizer_state_loaded = False
        try:
            optimizer_state = checkpoint.get('optimizer_state_dict')
            if optimizer_state:
-                try:
-                    # Try to load optimizer state
-                    self.optimizer.load_state_dict(optimizer_state)
-                except (KeyError, ValueError, RuntimeError) as e:
-                    logger.warning(f"Error loading optimizer state: {e}. Resetting optimizer.")
-                    # Recreate optimizer (same pattern as __init__)
-                    self.optimizer = torch.optim.AdamW(
-                        self.model.parameters(),
-                        lr=self.config.learning_rate,
-                        weight_decay=self.config.weight_decay
-                    )
+                # Validate optimizer state before loading
+                # Check if state dict has the expected structure
+                if 'state' in optimizer_state and 'param_groups' in optimizer_state:
+                    # Count parameters in saved state vs current model
+                    saved_param_count = len(optimizer_state.get('state', {}))
+                    current_param_count = sum(1 for _ in self.model.parameters() if _.requires_grad)
+
+                    if saved_param_count == current_param_count:
+                        try:
+                            # Try to load optimizer state
+                            self.optimizer.load_state_dict(optimizer_state)
+                            optimizer_state_loaded = True
+                            logger.info("Optimizer state loaded successfully")
+                        except (KeyError, ValueError, RuntimeError, TypeError) as e:
+                            logger.warning(f"Error loading optimizer state: {e}. State will be reset.")
+                            optimizer_state_loaded = False
+                    else:
+                        logger.warning(f"Optimizer state mismatch: {saved_param_count} saved params vs {current_param_count} current params. Resetting optimizer.")
+                        optimizer_state_loaded = False
+                else:
+                    logger.warning("Invalid optimizer state structure in checkpoint. Resetting optimizer.")
+                    optimizer_state_loaded = False
            else:
-                logger.warning("No optimizer state found in checkpoint. Using fresh optimizer.")
+                logger.info("No optimizer state found in checkpoint. Using fresh optimizer.")
+                optimizer_state_loaded = False
        except Exception as e:
            logger.warning(f"Error loading optimizer state: {e}. Resetting optimizer.")
-            # Recreate optimizer (same pattern as __init__)
+            optimizer_state_loaded = False
+
+        # Always recreate optimizer if state loading failed
+        if not optimizer_state_loaded:
+            logger.info("Creating fresh optimizer (checkpoint state was invalid or missing)")
            self.optimizer = torch.optim.AdamW(
                self.model.parameters(),
                lr=self.config.learning_rate,
                weight_decay=self.config.weight_decay
            )
+            # Also recreate scheduler to match
+            self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
+                self.optimizer,
+                max_lr=self.config.learning_rate,
+                total_steps=10000,
+                pct_start=0.1
+            )
 
        # Load scheduler state
        try:
diff --git a/run_experimental_gpu.sh b/run_experimental_gpu.sh
index 70d522a..eb5fb01 100644
--- a/run_experimental_gpu.sh
+++ b/run_experimental_gpu.sh
@@ -3,7 +3,7 @@
 # This tells ROCm to treat gfx1151 as gfx1100
 export HSA_OVERRIDE_GFX_VERSION=11.0.0
 export AMD_SERIALIZE_KERNEL=3 # Enable debugging
-export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 # Enable Flash Efficient attention
+# DISABLED: export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 # Was causing inplace operation errors
 cd /mnt/shared/DEV/repos/d-popov.com/gogo2
 source venv/bin/activate
 python ANNOTATE/web/app.py "$@"
diff --git a/start_with_gpu.sh b/start_with_gpu.sh
index 12c258a..e0610a0 100644
--- a/start_with_gpu.sh
+++ b/start_with_gpu.sh
@@ -7,11 +7,12 @@ export HSA_OVERRIDE_GFX_VERSION=11.0.0
 # Activate virtual environment
 source venv/bin/activate
 
-# Enable experimental Flash Efficient attention for AMD GPU
-export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
+# DISABLED: Experimental Flash Efficient attention for AMD GPU
+# export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
+# This was causing inplace operation errors during backward pass
 
 echo "GPU Compatibility: HSA_OVERRIDE_GFX_VERSION=11.0.0"
-echo "Experimental Features: TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1"
+echo "Experimental Features: DISABLED (was causing gradient computation errors)"
 echo "Virtual environment: $(which python)"
 echo ""
 echo "Starting application..."