diff --git a/.vscode/launch.json b/.vscode/launch.json
index 9055907..1820bcd 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -17,6 +17,7 @@
                 "ENABLE_REALTIME_CHARTS": "1",
                 "ENABLE_NN_MODELS": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -38,6 +39,7 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             }
         },
         {
@@ -59,6 +61,7 @@
                 "PYTHONUNBUFFERED": "1",
                 "CUDA_VISIBLE_DEVICES": "0",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             }
         },
         {
@@ -81,6 +84,7 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             }
         },
         {
@@ -93,6 +97,7 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             }
         },
         {
@@ -107,6 +112,7 @@
                 "FLASK_ENV": "development",
                 "FLASK_DEBUG": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             },
             "cwd": "${workspaceFolder}",
             "preLaunchTask": "Kill Stale Processes"
@@ -123,6 +129,7 @@
                 "COB_BTC_BUCKET_SIZE": "10",
                 "COB_ETH_BUCKET_SIZE": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -139,6 +146,7 @@
                 "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:256",
                 "ENABLE_REALTIME_RL": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -157,6 +165,7 @@
                 "COB_BTC_BUCKET_SIZE": "10",
                 "COB_ETH_BUCKET_SIZE": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -170,6 +179,7 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             }
         },
         {
@@ -182,6 +192,7 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             }
         },
@@ -203,6 +214,7 @@
                 "COBY_WEBSOCKET_PORT": "8081",
                 "COBY_LOG_LEVEL": "DEBUG",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
+                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "0"
             },
             "preLaunchTask": "Kill Stale Processes",
             "presentation": {
diff --git a/NN/models/advanced_transformer_trading.py b/NN/models/advanced_transformer_trading.py
index 63dedfe..e3c0aec 100644
--- a/NN/models/advanced_transformer_trading.py
+++ b/NN/models/advanced_transformer_trading.py
@@ -1049,20 +1049,24 @@ class TradingTransformerTrainer:
     def __init__(self, model: AdvancedTradingTransformer, config: TradingTransformerConfig):
         self.model = model
         self.config = config
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+        # Determine device from config or auto-detect
+        self.device = self._get_device_from_config()

         # Move model to device
         self.model.to(self.device)
         logger.info(f"Model moved to device: {self.device}")

         # Log GPU info if available
-        if torch.cuda.is_available():
+        if self.device.type == 'cuda' and torch.cuda.is_available():
             logger.info(f" GPU: {torch.cuda.get_device_name(0)}")
             logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
+        elif self.device.type == 'cpu':
+            logger.info(" Using CPU (GPU disabled or unavailable)")

         # MEMORY OPTIMIZATION: Enable gradient checkpointing if configured
         # This trades 20% compute for 30-40% memory savings
-        if config.use_gradient_checkpointing:
+        if self.config.use_gradient_checkpointing:
             logger.info("Enabling gradient checkpointing for memory efficiency")
             self._enable_gradient_checkpointing()
@@ -1077,11 +1081,82 @@ class TradingTransformerTrainer:
         # Optimizer with warmup
         self.optimizer = optim.AdamW(
-            model.parameters(),
-            lr=config.learning_rate,
-            weight_decay=config.weight_decay
+            self.model.parameters(),
+            lr=self.config.learning_rate,
+            weight_decay=self.config.weight_decay
         )

+        # Learning rate scheduler
+        self.scheduler = optim.lr_scheduler.OneCycleLR(
+            self.optimizer,
+            max_lr=self.config.learning_rate,
+            total_steps=10000, # Will be updated based on training data
+            pct_start=0.1
+        )
+
+        # Loss functions with class weights
+        # Pivot-based training: BUY at L pivots, SELL at H pivots (naturally balanced)
+        # Weights: [HOLD=0, BUY=1, SELL=2] - equal weighting for pivot-based trades
+        class_weights = torch.tensor([0.5, 1.0, 1.0], dtype=torch.float32, device=self.device)
+        self.action_criterion = nn.CrossEntropyLoss(weight=class_weights)
+        self.price_criterion = nn.MSELoss()
+        self.confidence_criterion = nn.BCELoss()
+
+        # Training history
+        self.training_history = {
+            'train_loss': [],
+            'val_loss': [],
+            'train_accuracy': [],
+            'val_accuracy': [],
+            'epochs': []
+        }
+
+    def _get_device_from_config(self) -> torch.device:
+        """Get device from config.yaml or auto-detect"""
+        try:
+            # Try to load config
+            from core.config import get_config
+            config = get_config()
+            gpu_config = config._config.get('gpu', {})
+
+            device_setting = gpu_config.get('device', 'auto')
+            fallback_to_cpu = gpu_config.get('fallback_to_cpu', True)
+            gpu_enabled = gpu_config.get('enabled', True)
+
+            # If GPU is disabled in config, use CPU
+            if not gpu_enabled:
+                logger.info("GPU disabled in config.yaml, using CPU")
+                return torch.device('cpu')
+
+            # Handle device selection
+            if device_setting == 'cpu':
+                logger.info("Device set to CPU in config.yaml")
+                return torch.device('cpu')
+            elif device_setting == 'cuda' or device_setting == 'auto':
+                # Try GPU first
+                if torch.cuda.is_available():
+                    logger.info("Using GPU (CUDA available)")
+                    return torch.device('cuda')
+                else:
+                    if fallback_to_cpu:
+                        logger.warning("CUDA not available, falling back to CPU")
+                        return torch.device('cpu')
+                    else:
+                        raise RuntimeError("CUDA not available and fallback_to_cpu is False")
+            else:
+                logger.warning(f"Unknown device setting '{device_setting}', using auto-detection")
+                return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+        except Exception as e:
+            logger.warning(f"Error reading device config: {e}, using auto-detection")
+            # Fallback to auto-detection
+            return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    def _enable_gradient_checkpointing(self):
+        """Enable gradient checkpointing for memory efficiency"""
+        # This is handled by the model itself if use_gradient_checkpointing is True
+        pass
+
         # Learning rate scheduler
         self.scheduler = optim.lr_scheduler.OneCycleLR(
             self.optimizer,
@@ -1229,8 +1304,8 @@ class TradingTransformerTrainer:

         # Enable anomaly detection temporarily to debug inplace operation issues
         # NOTE: This significantly slows down training (2-3x slower), use only for debugging
-        # Set to False once the issue is resolved
-        enable_anomaly_detection = False # Set to True to debug gradient issues
+        # Set to True to find exact inplace operation causing errors
+        enable_anomaly_detection = True # TEMPORARILY ENABLED to find inplace operations

         if enable_anomaly_detection:
             torch.autograd.set_detect_anomaly(True)
@@ -1276,30 +1351,54 @@ class TradingTransformerTrainer:
                 del batch_gpu
             # else: batch is already on GPU, use it directly!

+            # Ensure all batch tensors are on the same device as the model
+            # This is critical to avoid device mismatch errors
+            model_device = next(self.model.parameters()).device
+            batch_on_device = {}
+            for k, v in batch.items():
+                if isinstance(v, torch.Tensor):
+                    # Move tensor to model's device if it's not already there
+                    if v.device != model_device:
+                        batch_on_device[k] = v.to(model_device, non_blocking=True)
+                    else:
+                        batch_on_device[k] = v
+                else:
+                    batch_on_device[k] = v
+
+            # Also ensure model is on the correct device (in case it was moved elsewhere)
+            if model_device != self.device:
+                logger.warning(f"Model device ({model_device}) doesn't match trainer device ({self.device}). Moving model to {self.device}")
+                self.model.to(self.device)
+                model_device = self.device
+                # Re-move batch to correct device
+                for k, v in batch_on_device.items():
+                    if isinstance(v, torch.Tensor):
+                        batch_on_device[k] = v.to(self.device, non_blocking=True)
+
             # Use automatic mixed precision (FP16) for memory efficiency
             # Support both CUDA and ROCm (AMD) devices
             device_type = 'cuda' if self.device.type == 'cuda' else 'cpu'
             with torch.amp.autocast(device_type, enabled=self.use_amp and device_type != 'cpu'):
                 # Forward pass with multi-timeframe data
                 outputs = self.model(
-                    price_data_1s=batch.get('price_data_1s'),
-                    price_data_1m=batch.get('price_data_1m'),
-                    price_data_1h=batch.get('price_data_1h'),
-                    price_data_1d=batch.get('price_data_1d'),
-                    btc_data_1m=batch.get('btc_data_1m'),
-                    cob_data=batch.get('cob_data'), # Use .get() to handle missing key
-                    tech_data=batch.get('tech_data'),
-                    market_data=batch.get('market_data'),
-                    position_state=batch.get('position_state'),
-                    price_data=batch.get('price_data') # Legacy fallback
+                    price_data_1s=batch_on_device.get('price_data_1s'),
+                    price_data_1m=batch_on_device.get('price_data_1m'),
+                    price_data_1h=batch_on_device.get('price_data_1h'),
+                    price_data_1d=batch_on_device.get('price_data_1d'),
+                    btc_data_1m=batch_on_device.get('btc_data_1m'),
+                    cob_data=batch_on_device.get('cob_data'), # Use .get() to handle missing key
+                    tech_data=batch_on_device.get('tech_data'),
+                    market_data=batch_on_device.get('market_data'),
+                    position_state=batch_on_device.get('position_state'),
+                    price_data=batch_on_device.get('price_data') # Legacy fallback
                 )

-                # Calculate losses
-                action_loss = self.action_criterion(outputs['action_logits'], batch['actions'])
+                # Calculate losses (use batch_on_device for consistency)
+                action_loss = self.action_criterion(outputs['action_logits'], batch_on_device['actions'])

                 # FIXED: Ensure shapes match for MSELoss
                 price_pred = outputs['price_prediction']
-                price_target = batch['future_prices']
+                price_target = batch_on_device['future_prices']

                 # Both should be [batch, 1], but ensure they match
                 if price_pred.shape != price_target.shape:
@@ -1310,14 +1409,14 @@ class TradingTransformerTrainer:

                 # NEW: Trend analysis loss (if trend_target provided)
                 trend_loss = torch.tensor(0.0, device=self.device)
-                if 'trend_target' in batch and 'trend_analysis' in outputs:
+                if 'trend_target' in batch_on_device and 'trend_analysis' in outputs:
                     trend_pred = torch.cat([
                         outputs['trend_analysis']['angle_radians'],
                         outputs['trend_analysis']['steepness'],
                         outputs['trend_analysis']['direction']
                     ], dim=1) # [batch, 3]
-                    trend_target = batch['trend_target']
+                    trend_target = batch_on_device['trend_target']

                     if trend_pred.shape == trend_target.shape:
                         trend_loss = self.price_criterion(trend_pred, trend_target)
                         logger.debug(f"Trend loss: {trend_loss.item():.6f} (pred={trend_pred[0].tolist()}, target={trend_target[0].tolist()})")
@@ -1333,7 +1432,7 @@ class TradingTransformerTrainer:

                 # Get normalization parameters if available
                 # norm_params may be a dict or a list of dicts (one per sample in batch)
-                norm_params_raw = batch.get('norm_params', {})
+                norm_params_raw = batch_on_device.get('norm_params', {})
                 if isinstance(norm_params_raw, list) and len(norm_params_raw) > 0:
                     # If it's a list, use the first one (batch size is typically 1)
                     norm_params = norm_params_raw[0]
@@ -1344,9 +1443,9 @@ class TradingTransformerTrainer:

                 for tf in ['1s', '1m', '1h', '1d']:
                     future_key = f'future_candle_{tf}'
-                    if tf in outputs['next_candles'] and future_key in batch:
+                    if tf in outputs['next_candles'] and future_key in batch_on_device:
                         pred_candle = outputs['next_candles'][tf] # [batch, 5] - predicted OHLCV (normalized)
-                        target_candle = batch[future_key] # [batch, 5] - actual OHLCV (normalized)
+                        target_candle = batch_on_device[future_key] # [batch, 5] - actual OHLCV (normalized)

                         if target_candle is not None and pred_candle.shape == target_candle.shape:
                             # MSE loss on normalized values (used for backprop)
@@ -1386,10 +1485,10 @@ class TradingTransformerTrainer:
                     total_loss = total_loss / 5.0

                 # Add confidence loss if available
-                if 'confidence' in outputs and 'trade_success' in batch:
+                if 'confidence' in outputs and 'trade_success' in batch_on_device:
                     # Both tensors should have shape [batch_size, 1] for BCELoss
                     confidence_pred = outputs['confidence']
-                    trade_target = batch['trade_success'].float()
+                    trade_target = batch_on_device['trade_success'].float()

                     # FIXED: Ensure both are 2D tensors [batch_size, 1]
                     # Handle different input shapes robustly
@@ -1471,6 +1570,31 @@ class TradingTransformerTrainer:
                 # Note: We need to recompute loss and backward pass, but for now just skip this step
                 logger.warning("Skipping optimizer step after reset - gradients need to be recomputed")
                 # Don't raise - allow training to continue with next batch
+            except RuntimeError as gpu_error:
+                # Check if it's a GPU-related error and fallback to CPU if configured
+                if "cuda" in str(gpu_error).lower() or "gpu" in str(gpu_error).lower():
+                    logger.error(f"GPU error during optimizer step: {gpu_error}")
+                    # Try to fallback to CPU if configured
+                    try:
+                        from core.config import get_config
+                        config = get_config()
+                        fallback_to_cpu = config._config.get('gpu', {}).get('fallback_to_cpu', True)
+                        if fallback_to_cpu and self.device.type == 'cuda':
+                            logger.warning("Falling back to CPU due to GPU errors")
+                            self.device = torch.device('cpu')
+                            self.model.to(self.device)
+                            # Recreate optimizer for CPU
+                            self.optimizer = torch.optim.AdamW(
+                                self.model.parameters(),
+                                lr=self.config.learning_rate,
+                                weight_decay=self.config.weight_decay
+                            )
+                            logger.info("Model moved to CPU, training will continue on CPU")
+                            # Skip this step, continue with next batch
+                            return result
+                    except Exception as fallback_error:
+                        logger.error(f"Failed to fallback to CPU: {fallback_error}")
+                        raise

             self.scheduler.step()

@@ -1486,12 +1610,12 @@ class TradingTransformerTrainer:

             if 'next_candles' in outputs:
                 # Use 1s or 1m timeframe as primary metric (try 1s first)
-                if '1s' in outputs['next_candles'] and 'future_candle_1s' in batch:
+                if '1s' in outputs['next_candles'] and 'future_candle_1s' in batch_on_device:
                     pred_candle = outputs['next_candles']['1s'] # [batch, 5]
-                    actual_candle = batch['future_candle_1s'] # [batch, 5]
-                elif '1m' in outputs['next_candles'] and 'future_candle_1m' in batch:
+                    actual_candle = batch_on_device['future_candle_1s'] # [batch, 5]
+                elif '1m' in outputs['next_candles'] and 'future_candle_1m' in batch_on_device:
                     pred_candle = outputs['next_candles']['1m'] # [batch, 5]
-                    actual_candle = batch['future_candle_1m'] # [batch, 5]
+                    actual_candle = batch_on_device['future_candle_1m'] # [batch, 5]
                 else:
                     pred_candle = None
                     actual_candle = None
@@ -1521,12 +1645,12 @@ class TradingTransformerTrainer:

             # SECONDARY: Trend vector prediction accuracy
             trend_accuracy = 0.0
-            if 'trend_analysis' in outputs and 'trend_target' in batch:
+            if 'trend_analysis' in outputs and 'trend_target' in batch_on_device:
                 pred_angle = outputs['trend_analysis']['angle_radians']
                 pred_steepness = outputs['trend_analysis']['steepness']

-                actual_angle = batch['trend_target'][:, 0:1]
-                actual_steepness = batch['trend_target'][:, 1:2]
+                actual_angle = batch_on_device['trend_target'][:, 0:1]
+                actual_steepness = batch_on_device['trend_target'][:, 1:2]

                 # Angle error (degrees)
                 angle_error_rad = torch.abs(pred_angle - actual_angle)
@@ -1541,7 +1665,7 @@ class TradingTransformerTrainer:

             # LEGACY: Action accuracy (for comparison)
             action_predictions = torch.argmax(outputs['action_logits'], dim=-1)
-            action_accuracy = (action_predictions == batch['actions']).float().mean().item()
+            action_accuracy = (action_predictions == batch_on_device['actions']).float().mean().item()

             # Extract values and delete tensors to free memory
             result = {
diff --git a/config.yaml b/config.yaml
index 628b0c9..f607235 100644
--- a/config.yaml
+++ b/config.yaml
@@ -253,9 +253,11 @@ logs_dir: "logs"

 # GPU/Performance
 gpu:
-  enabled: true
+  enabled: false # TEMPORARILY DISABLED - forcing CPU mode to test if GPU errors persist
+  device: "cpu" # "auto", "cuda", "cpu" - FORCED TO CPU for testing
   memory_fraction: 0.8 # Use 80% of GPU memory
   allow_growth: true # Allow dynamic memory allocation
+  fallback_to_cpu: true # Fallback to CPU if GPU operations fail

 # Monitoring and Alerting
 monitoring:
diff --git a/run_experimental_gpu.sh b/run_experimental_gpu.sh
index eb5fb01..c9f208d 100644
--- a/run_experimental_gpu.sh
+++ b/run_experimental_gpu.sh
@@ -3,7 +3,7 @@
 # This tells ROCm to treat gfx1151 as gfx1100
 export HSA_OVERRIDE_GFX_VERSION=11.0.0
 export AMD_SERIALIZE_KERNEL=3 # Enable debugging
-# DISABLED: export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 # Was causing inplace operation errors
+export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=0 # DISABLED - was causing inplace operation errors
 cd /mnt/shared/DEV/repos/d-popov.com/gogo2
 source venv/bin/activate
 python ANNOTATE/web/app.py "$@"
diff --git a/start_with_gpu.sh b/start_with_gpu.sh
index e0610a0..199f63d 100644
--- a/start_with_gpu.sh
+++ b/start_with_gpu.sh
@@ -7,9 +7,8 @@
 export HSA_OVERRIDE_GFX_VERSION=11.0.0

 # Activate virtual environment
 source venv/bin/activate

-# DISABLED: Experimental Flash Efficient attention for AMD GPU
-# export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
-# This was causing inplace operation errors during backward pass
+# Experimental Flash Efficient attention for AMD GPU (DISABLED - was causing inplace operation errors)
+export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=0

 echo "GPU Compatibility: HSA_OVERRIDE_GFX_VERSION=11.0.0"
 echo "Experimental Features: DISABLED (was causing gradient computation errors)"