GPU issues: disable experimental ROCm AOTriton attention flag and harden optimizer state handling
.vscode/launch.json (12 changed lines)
@@ -17,7 +17,6 @@
                 "ENABLE_REALTIME_CHARTS": "1",
                 "ENABLE_NN_MODELS": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -39,7 +38,6 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
         {
@@ -61,7 +59,6 @@
                 "PYTHONUNBUFFERED": "1",
                 "CUDA_VISIBLE_DEVICES": "0",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
         {
@@ -84,7 +81,6 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
         {
@@ -97,7 +93,6 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
         {
@@ -112,7 +107,6 @@
                 "FLASK_ENV": "development",
                 "FLASK_DEBUG": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "cwd": "${workspaceFolder}",
             "preLaunchTask": "Kill Stale Processes"
@@ -129,7 +123,6 @@
                 "COB_BTC_BUCKET_SIZE": "10",
                 "COB_ETH_BUCKET_SIZE": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -146,7 +139,6 @@
                 "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:256",
                 "ENABLE_REALTIME_RL": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -165,7 +157,6 @@
                 "COB_BTC_BUCKET_SIZE": "10",
                 "COB_ETH_BUCKET_SIZE": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "preLaunchTask": "Kill Stale Processes"
         },
@@ -179,7 +170,6 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },
         {
@@ -192,7 +182,6 @@
             "env": {
                 "PYTHONUNBUFFERED": "1",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             }
         },

@@ -214,7 +203,6 @@
                 "COBY_WEBSOCKET_PORT": "8081",
                 "COBY_LOG_LEVEL": "DEBUG",
                 "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
-                "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"
             },
             "preLaunchTask": "Kill Stale Processes",
             "presentation": {
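Every launch configuration keeps "HSA_OVERRIDE_GFX_VERSION": "11.0.0" (gfx1151 reported as gfx1100) and only drops the experimental AOTriton flag. A quick way to confirm the override still yields a usable device is a small standalone check. The script below is a sketch, not part of the commit, and the file name check_rocm.py is invented:

# check_rocm.py - hypothetical helper, not included in this commit.
# Run it from the same venv the launch configs use, with the override exported,
# to confirm the ROCm device works without TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL.
import os
import torch

print("HSA_OVERRIDE_GFX_VERSION =", os.environ.get("HSA_OVERRIDE_GFX_VERSION", "<unset>"))
print("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL =",
      os.environ.get("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL", "<unset>"))

# On ROCm builds torch.cuda is backed by HIP, so the usual CUDA calls apply.
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
    x = torch.randn(64, 64, device="cuda")
    print("matmul ok:", float((x @ x).sum()))
else:
    print("No GPU visible - check the ROCm install and the override value.")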
@@ -1227,6 +1227,13 @@ class TradingTransformerTrainer:
         try:
             self.model.train()

+            # Enable anomaly detection temporarily to debug inplace operation issues
+            # NOTE: This significantly slows down training (2-3x slower), use only for debugging
+            # Set to False once the issue is resolved
+            enable_anomaly_detection = False  # Set to True to debug gradient issues
+            if enable_anomaly_detection:
+                torch.autograd.set_detect_anomaly(True)
+
             # GRADIENT ACCUMULATION: Determine if this is an accumulation step
             # If gradient_accumulation_steps is set, use automatic accumulation
             # Otherwise, fall back to manual accumulate_gradients flag
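Because enable_anomaly_detection defaults to False, the global torch.autograd.set_detect_anomaly(True) call never runs in normal training. When the inplace-operation error needs to be chased down, a scoped alternative is the context-manager form, which confines the 2-3x slowdown to the step under investigation. The helper below is only a sketch; model, batch and loss_fn stand in for the trainer's real objects:

import torch

def debug_single_step(model, batch, loss_fn):
    """Hypothetical helper: run one forward/backward with anomaly detection scoped to it."""
    # detect_anomaly() restores the previous setting on exit, so the rest of the
    # training loop keeps its normal speed.
    with torch.autograd.detect_anomaly():
        outputs = model(batch["inputs"])
        loss = loss_fn(outputs, batch["targets"])
        loss.backward()  # the enriched traceback points at the op that broke the backward pass
    return loss.detach()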
@@ -1445,8 +1452,25 @@ class TradingTransformerTrainer:
             else:
                 # Gradient clipping
                 torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm)
-                # Optimizer step
-                self.optimizer.step()
+                # Optimizer step with error handling
+                try:
+                    self.optimizer.step()
+                except (KeyError, RuntimeError) as opt_error:
+                    logger.error(f"Optimizer step failed: {opt_error}. Resetting optimizer state.")
+                    # Zero gradients first to clear any stale gradients
+                    self.optimizer.zero_grad(set_to_none=True)
+                    # Reset optimizer to fix corrupted state
+                    self.optimizer = torch.optim.AdamW(
+                        self.model.parameters(),
+                        lr=self.config.learning_rate,
+                        weight_decay=self.config.weight_decay
+                    )
+                    # Zero gradients again after recreating optimizer
+                    self.optimizer.zero_grad(set_to_none=True)
+                    # A real retry would require recomputing the loss and backward pass,
+                    # so this step is skipped and training continues with the next batch
+                    logger.warning("Skipping optimizer step after reset - gradients need to be recomputed")
+                    # Don't raise - allow training to continue with next batch
+
                 self.scheduler.step()
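The recovery path above deliberately drops the current batch: after the optimizer is rebuilt its state is empty and the already-computed gradients belong to a step that never happened, so retrying without a fresh backward pass would be wrong. Factored out of the trainer, the same idea looks roughly like this (a sketch with illustrative names and AdamW defaults; safe_optimizer_step does not exist in the repo):

import logging
import torch

logger = logging.getLogger(__name__)

def safe_optimizer_step(model, optimizer, lr=1e-4, weight_decay=0.01):
    """Step the optimizer; on a corrupted-state error return a fresh AdamW instead."""
    try:
        optimizer.step()
        return optimizer
    except (KeyError, RuntimeError) as err:
        logger.error("Optimizer step failed: %s - recreating optimizer", err)
        fresh = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        # Discard this batch's gradients; the caller simply moves on to the next batch.
        fresh.zero_grad(set_to_none=True)
        return fresh

Note that any LR scheduler bound to the old optimizer would also need to be rebuilt, which is exactly what the checkpoint-loading hunk below does.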
@@ -1697,30 +1721,55 @@ class TradingTransformerTrainer:
                 logger.warning(f"Error loading model state dict: {e}, continuing with partial load")

             # Load optimizer state (handle mismatched states gracefully)
+            # IMPORTANT: Always recreate optimizer if there's any issue to avoid corrupted state
+            optimizer_state_loaded = False
             try:
                 optimizer_state = checkpoint.get('optimizer_state_dict')
                 if optimizer_state:
-                    try:
-                        # Try to load optimizer state
-                        self.optimizer.load_state_dict(optimizer_state)
-                    except (KeyError, ValueError, RuntimeError) as e:
-                        logger.warning(f"Error loading optimizer state: {e}. Resetting optimizer.")
-                        # Recreate optimizer (same pattern as __init__)
-                        self.optimizer = torch.optim.AdamW(
-                            self.model.parameters(),
-                            lr=self.config.learning_rate,
-                            weight_decay=self.config.weight_decay
-                        )
+                    # Validate optimizer state before loading
+                    # Check if state dict has the expected structure
+                    if 'state' in optimizer_state and 'param_groups' in optimizer_state:
+                        # Count parameters in saved state vs current model
+                        saved_param_count = len(optimizer_state.get('state', {}))
+                        current_param_count = sum(1 for _ in self.model.parameters() if _.requires_grad)
+
+                        if saved_param_count == current_param_count:
+                            try:
+                                # Try to load optimizer state
+                                self.optimizer.load_state_dict(optimizer_state)
+                                optimizer_state_loaded = True
+                                logger.info("Optimizer state loaded successfully")
+                            except (KeyError, ValueError, RuntimeError, TypeError) as e:
+                                logger.warning(f"Error loading optimizer state: {e}. State will be reset.")
+                                optimizer_state_loaded = False
+                        else:
+                            logger.warning(f"Optimizer state mismatch: {saved_param_count} saved params vs {current_param_count} current params. Resetting optimizer.")
+                            optimizer_state_loaded = False
+                    else:
+                        logger.warning("Invalid optimizer state structure in checkpoint. Resetting optimizer.")
+                        optimizer_state_loaded = False
                 else:
-                    logger.warning("No optimizer state found in checkpoint. Using fresh optimizer.")
+                    logger.info("No optimizer state found in checkpoint. Using fresh optimizer.")
+                    optimizer_state_loaded = False
             except Exception as e:
                 logger.warning(f"Error loading optimizer state: {e}. Resetting optimizer.")
-                # Recreate optimizer (same pattern as __init__)
+                optimizer_state_loaded = False
+
+            # Always recreate optimizer if state loading failed
+            if not optimizer_state_loaded:
+                logger.info("Creating fresh optimizer (checkpoint state was invalid or missing)")
                 self.optimizer = torch.optim.AdamW(
                     self.model.parameters(),
                     lr=self.config.learning_rate,
                     weight_decay=self.config.weight_decay
                 )
+                # Also recreate scheduler to match
+                self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
+                    self.optimizer,
+                    max_lr=self.config.learning_rate,
+                    total_steps=10000,
+                    pct_start=0.1
+                )

             # Load scheduler state
             try:
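One caveat with the parameter-count validation: AdamW fills its state dict's 'state' entries lazily, only for parameters that have actually stepped, so a checkpoint saved early in training can legitimately have fewer 'state' entries than the model has parameters, and the mismatch branch above would then discard a usable optimizer state. Comparing against 'param_groups', which always lists every parameter handed to the optimizer, is stricter. The function below is a sketch of that idea, not code from the commit:

def optimizer_state_matches(optimizer_state, model):
    """Hypothetical stricter check: compare param_groups sizes, not the lazily-filled 'state'."""
    groups = optimizer_state.get('param_groups', [])
    saved_params = sum(len(g.get('params', [])) for g in groups)
    # Count everything passed to AdamW at construction (self.model.parameters() in the trainer).
    current_params = sum(1 for _ in model.parameters())
    return saved_params == current_params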
@@ -3,7 +3,7 @@
 # This tells ROCm to treat gfx1151 as gfx1100
 export HSA_OVERRIDE_GFX_VERSION=11.0.0
 export AMD_SERIALIZE_KERNEL=3  # Enable debugging
-export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1  # Enable Flash Efficient attention
+# DISABLED: export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1  # Was causing inplace operation errors
 cd /mnt/shared/DEV/repos/d-popov.com/gogo2
 source venv/bin/activate
 python ANNOTATE/web/app.py "$@"
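The launch scripts now rely on the flag simply not being exported, but an environment inherited from an older shell session or a stale launch config could still carry it. A small defensive guard at application startup makes that assumption explicit; this is a sketch, not part of the commit:

# Hypothetical startup guard, e.g. near the top of the entry point, before heavy torch use.
import os

_FLAG = "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL"
if os.environ.get(_FLAG) == "1":
    print(f"Warning: {_FLAG}=1 inherited from the environment; unsetting it because "
          "it previously caused inplace-operation errors in the backward pass.")
    os.environ.pop(_FLAG, None)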
@@ -7,11 +7,12 @@ export HSA_OVERRIDE_GFX_VERSION=11.0.0
 # Activate virtual environment
 source venv/bin/activate

-# Enable experimental Flash Efficient attention for AMD GPU
-export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
+# DISABLED: Experimental Flash Efficient attention for AMD GPU
+# export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
+# This was causing inplace operation errors during backward pass

 echo "GPU Compatibility: HSA_OVERRIDE_GFX_VERSION=11.0.0"
-echo "Experimental Features: TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1"
+echo "Experimental Features: DISABLED (was causing gradient computation errors)"
 echo "Virtual environment: $(which python)"
 echo ""
 echo "Starting application..."
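Per the comments above, the flag's purpose was to enable the experimental Flash/memory-efficient attention path on this AMD GPU. With it disabled, a belt-and-braces measure on the Python side is to pin scaled-dot-product attention to the plain math backend in the suspect region. The snippet below is a sketch; it uses torch.backends.cuda.sdp_kernel, which recent PyTorch releases deprecate in favour of torch.nn.attention.sdpa_kernel, so adjust to whichever the installed version provides:

import torch
import torch.nn.functional as F

q = torch.randn(2, 4, 128, 64, device="cuda")
k = torch.randn(2, 4, 128, 64, device="cuda")
v = torch.randn(2, 4, 128, 64, device="cuda")

# Force the plain math implementation of SDPA so no flash/mem-efficient kernel is selected.
with torch.backends.cuda.sdp_kernel(enable_flash=False,
                                     enable_mem_efficient=False,
                                     enable_math=True):
    out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)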