gpu issues
.vscode/launch.json (vendored): 12 changed lines
@@ -17,7 +17,6 @@
         "ENABLE_REALTIME_CHARTS": "1",
         "ENABLE_NN_MODELS": "1",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       },
       "preLaunchTask": "Kill Stale Processes"
     },
@@ -39,7 +38,6 @@
       "env": {
         "PYTHONUNBUFFERED": "1",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       }
     },
     {
@@ -61,7 +59,6 @@
         "PYTHONUNBUFFERED": "1",
         "CUDA_VISIBLE_DEVICES": "0",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       }
     },
     {
@@ -84,7 +81,6 @@
       "env": {
         "PYTHONUNBUFFERED": "1",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       }
     },
     {
@@ -97,7 +93,6 @@
       "env": {
         "PYTHONUNBUFFERED": "1",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       }
     },
     {
@@ -112,7 +107,6 @@
         "FLASK_ENV": "development",
         "FLASK_DEBUG": "1",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       },
       "cwd": "${workspaceFolder}",
       "preLaunchTask": "Kill Stale Processes"
@@ -129,7 +123,6 @@
         "COB_BTC_BUCKET_SIZE": "10",
         "COB_ETH_BUCKET_SIZE": "1",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       },
       "preLaunchTask": "Kill Stale Processes"
     },
@@ -146,7 +139,6 @@
         "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:256",
         "ENABLE_REALTIME_RL": "1",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       },
       "preLaunchTask": "Kill Stale Processes"
     },
@@ -165,7 +157,6 @@
         "COB_BTC_BUCKET_SIZE": "10",
         "COB_ETH_BUCKET_SIZE": "1",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       },
       "preLaunchTask": "Kill Stale Processes"
     },
@@ -179,7 +170,6 @@
       "env": {
         "PYTHONUNBUFFERED": "1",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       }
     },
     {
@@ -192,7 +182,6 @@
       "env": {
         "PYTHONUNBUFFERED": "1",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       }
     },

@@ -214,7 +203,6 @@
         "COBY_WEBSOCKET_PORT": "8081",
         "COBY_LOG_LEVEL": "DEBUG",
         "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
       },
       "preLaunchTask": "Kill Stale Processes",
       "presentation": {
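
The launch.json hunks above all make the same change: every debug configuration keeps HSA_OVERRIDE_GFX_VERSION=11.0.0 but drops TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL from its env block. A minimal, illustrative sketch of reproducing that environment outside VS Code (the script path comes from the launch scripts further down; the launcher itself is hypothetical, not part of this commit):

    import os
    import subprocess

    # Mirror the edited launch.json env blocks: keep the gfx1151 -> gfx1100 override,
    # leave the experimental AOTriton flag unset.
    env = dict(os.environ)
    env.update({
        "PYTHONUNBUFFERED": "1",
        "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
        # "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"  # intentionally omitted
    })
    subprocess.run(["python", "ANNOTATE/web/app.py"], env=env, check=True)
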
@@ -1227,6 +1227,13 @@ class TradingTransformerTrainer:
         try:
             self.model.train()

+            # Enable anomaly detection temporarily to debug inplace operation issues
+            # NOTE: This significantly slows down training (2-3x slower), use only for debugging
+            # Set to False once the issue is resolved
+            enable_anomaly_detection = False  # Set to True to debug gradient issues
+            if enable_anomaly_detection:
+                torch.autograd.set_detect_anomaly(True)
+
             # GRADIENT ACCUMULATION: Determine if this is an accumulation step
             # If gradient_accumulation_steps is set, use automatic accumulation
             # Otherwise, fall back to manual accumulate_gradients flag
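
For reference, a standalone sketch of how the set_detect_anomaly toggle above is typically used (the function, model, and loss names are illustrative, not from this repo): running the forward/backward pass under the context manager makes autograd report the operation that produced the failing gradient, at roughly the 2-3x slowdown noted in the comment.

    import torch

    def train_step(model, optimizer, loss_fn, batch, targets, debug_anomalies=False):
        """One training step; pass debug_anomalies=True only while hunting gradient errors."""
        model.train()
        with torch.autograd.set_detect_anomaly(debug_anomalies):
            optimizer.zero_grad(set_to_none=True)
            loss = loss_fn(model(batch), targets)
            loss.backward()  # with anomaly detection on, failures name the offending op
            optimizer.step()
        return loss.item()
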
@@ -1445,8 +1452,25 @@ class TradingTransformerTrainer:
             else:
                 # Gradient clipping
                 torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm)
-                # Optimizer step
-                self.optimizer.step()
+                # Optimizer step with error handling
+                try:
+                    self.optimizer.step()
+                except (KeyError, RuntimeError) as opt_error:
+                    logger.error(f"Optimizer step failed: {opt_error}. Resetting optimizer state.")
+                    # Zero gradients first to clear any stale gradients
+                    self.optimizer.zero_grad(set_to_none=True)
+                    # Reset optimizer to fix corrupted state
+                    self.optimizer = torch.optim.AdamW(
+                        self.model.parameters(),
+                        lr=self.config.learning_rate,
+                        weight_decay=self.config.weight_decay
+                    )
+                    # Zero gradients again after recreating optimizer
+                    self.optimizer.zero_grad(set_to_none=True)
+                    # Retry optimizer step with fresh state
+                    # Note: We need to recompute loss and backward pass, but for now just skip this step
+                    logger.warning("Skipping optimizer step after reset - gradients need to be recomputed")
+                    # Don't raise - allow training to continue with next batch

             self.scheduler.step()

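
A condensed sketch of the recovery pattern added above, under the assumption that a failed step() leaves AdamW with unusable internal state (the helper name and hyperparameter defaults are illustrative):

    import logging
    import torch

    logger = logging.getLogger(__name__)

    def step_with_reset(model, optimizer, lr=1e-4, weight_decay=0.01):
        """Try optimizer.step(); on a corrupted-state error, rebuild AdamW and skip this step."""
        try:
            optimizer.step()
            return optimizer
        except (KeyError, RuntimeError) as exc:
            logger.error("Optimizer step failed: %s. Recreating optimizer.", exc)
            optimizer.zero_grad(set_to_none=True)
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
            optimizer.zero_grad(set_to_none=True)
            # Gradients were computed against the old optimizer state; continue with the next batch.
            return optimizer
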
@@ -1697,30 +1721,55 @@ class TradingTransformerTrainer:
                 logger.warning(f"Error loading model state dict: {e}, continuing with partial load")

         # Load optimizer state (handle mismatched states gracefully)
+        # IMPORTANT: Always recreate optimizer if there's any issue to avoid corrupted state
+        optimizer_state_loaded = False
         try:
             optimizer_state = checkpoint.get('optimizer_state_dict')
             if optimizer_state:
-                try:
-                    # Try to load optimizer state
-                    self.optimizer.load_state_dict(optimizer_state)
-                except (KeyError, ValueError, RuntimeError) as e:
-                    logger.warning(f"Error loading optimizer state: {e}. Resetting optimizer.")
-                    # Recreate optimizer (same pattern as __init__)
-                    self.optimizer = torch.optim.AdamW(
-                        self.model.parameters(),
-                        lr=self.config.learning_rate,
-                        weight_decay=self.config.weight_decay
-                    )
+                # Validate optimizer state before loading
+                # Check if state dict has the expected structure
+                if 'state' in optimizer_state and 'param_groups' in optimizer_state:
+                    # Count parameters in saved state vs current model
+                    saved_param_count = len(optimizer_state.get('state', {}))
+                    current_param_count = sum(1 for _ in self.model.parameters() if _.requires_grad)
+
+                    if saved_param_count == current_param_count:
+                        try:
+                            # Try to load optimizer state
+                            self.optimizer.load_state_dict(optimizer_state)
+                            optimizer_state_loaded = True
+                            logger.info("Optimizer state loaded successfully")
+                        except (KeyError, ValueError, RuntimeError, TypeError) as e:
+                            logger.warning(f"Error loading optimizer state: {e}. State will be reset.")
+                            optimizer_state_loaded = False
+                    else:
+                        logger.warning(f"Optimizer state mismatch: {saved_param_count} saved params vs {current_param_count} current params. Resetting optimizer.")
+                        optimizer_state_loaded = False
+                else:
+                    logger.warning("Invalid optimizer state structure in checkpoint. Resetting optimizer.")
+                    optimizer_state_loaded = False
             else:
-                logger.warning("No optimizer state found in checkpoint. Using fresh optimizer.")
+                logger.info("No optimizer state found in checkpoint. Using fresh optimizer.")
+                optimizer_state_loaded = False
         except Exception as e:
             logger.warning(f"Error loading optimizer state: {e}. Resetting optimizer.")
-            # Recreate optimizer (same pattern as __init__)
+            optimizer_state_loaded = False
+
+        # Always recreate optimizer if state loading failed
+        if not optimizer_state_loaded:
+            logger.info("Creating fresh optimizer (checkpoint state was invalid or missing)")
             self.optimizer = torch.optim.AdamW(
                 self.model.parameters(),
                 lr=self.config.learning_rate,
                 weight_decay=self.config.weight_decay
             )
+            # Also recreate scheduler to match
+            self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
+                self.optimizer,
+                max_lr=self.config.learning_rate,
+                total_steps=10000,
+                pct_start=0.1
+            )

         # Load scheduler state
         try:
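
The same validate-then-load logic, reduced to a hypothetical standalone helper; as in the hunk above, the caller would recreate AdamW and the OneCycleLR scheduler whenever it returns False:

    import torch

    def try_load_optimizer_state(optimizer, model, state_dict) -> bool:
        """Load a checkpointed optimizer state only when it matches the current model."""
        if not state_dict or "state" not in state_dict or "param_groups" not in state_dict:
            return False
        saved = len(state_dict["state"])
        current = sum(1 for p in model.parameters() if p.requires_grad)
        if saved != current:
            return False  # checkpoint was saved against a different parameter set
        try:
            optimizer.load_state_dict(state_dict)
            return True
        except (KeyError, ValueError, RuntimeError, TypeError):
            return False
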
@@ -3,7 +3,7 @@
 # This tells ROCm to treat gfx1151 as gfx1100
 export HSA_OVERRIDE_GFX_VERSION=11.0.0
 export AMD_SERIALIZE_KERNEL=3  # Enable debugging
-export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1  # Enable Flash Efficient attention
+# DISABLED: export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1  # Was causing inplace operation errors
 cd /mnt/shared/DEV/repos/d-popov.com/gogo2
 source venv/bin/activate
 python ANNOTATE/web/app.py "$@"
@@ -7,11 +7,12 @@ export HSA_OVERRIDE_GFX_VERSION=11.0.0
 # Activate virtual environment
 source venv/bin/activate

-# Enable experimental Flash Efficient attention for AMD GPU
-export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
+# DISABLED: Experimental Flash Efficient attention for AMD GPU
+# export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
+# This was causing inplace operation errors during backward pass

 echo "GPU Compatibility: HSA_OVERRIDE_GFX_VERSION=11.0.0"
-echo "Experimental Features: TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1"
+echo "Experimental Features: DISABLED (was causing gradient computation errors)"
 echo "Virtual environment: $(which python)"
 echo ""
 echo "Starting application..."
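
Since both launch scripts now rely on HSA_OVERRIDE_GFX_VERSION being exported and on the experimental flag staying unset, a small hypothetical startup check along these lines could catch a stale shell environment before training begins (helper name and expected values are assumptions based on the scripts above):

    import os
    import logging

    logger = logging.getLogger(__name__)

    def check_rocm_env() -> None:
        """Log whether the ROCm compatibility settings match what the launch scripts export."""
        gfx = os.environ.get("HSA_OVERRIDE_GFX_VERSION")
        if gfx != "11.0.0":
            logger.warning("HSA_OVERRIDE_GFX_VERSION=%s (expected 11.0.0 for gfx1151 cards)", gfx)
        if os.environ.get("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL") == "1":
            logger.warning(
                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL is set; this commit disables it because "
                "it triggered inplace-operation errors during backward()"
            )
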