wip old MISC fix

Dobromir Popov
2025-12-08 16:56:37 +02:00
parent 81e7e6bfe6
commit 03888b6200
5 changed files with 719 additions and 343 deletions

@@ -318,27 +318,62 @@ class TradingOrchestrator:
         # Initialize confidence threshold
         self.confidence_threshold = self.config.get('confidence_threshold', 0.6)
 
-        # Determine the device to use (GPU if available, else CPU)
-        # Initialize device - force CPU mode to avoid CUDA errors
-        if torch.cuda.is_available():
-            try:
-                # Test CUDA availability with actual Linear layer operation
-                # This catches architecture-specific issues like gfx1151 incompatibility
-                test_tensor = torch.randn(2, 10).cuda()
-                test_linear = torch.nn.Linear(10, 5).cuda()
-                test_result = test_linear(test_tensor)
-                logger.info(f"GPU compatibility test passed: {torch.cuda.get_device_name(0)}")
-                self.device = torch.device("cuda")
-                logger.info("CUDA/ROCm device initialized successfully")
-            except Exception as e:
-                logger.warning(f"CUDA/ROCm initialization failed: {e}")
-                logger.warning("GPU architecture may not be supported - falling back to CPU")
-                logger.warning("This is common with newer AMD GPUs (gfx1151+) that require specific PyTorch builds")
-                self.device = torch.device("cpu")
-        else:
-            self.device = torch.device("cpu")
+        # Determine the device to use from config.yaml
+        self.device = self._get_device_from_config()
         logger.info(f"Using device: {self.device}")
 
+    def _get_device_from_config(self) -> torch.device:
+        """Get device from config.yaml or auto-detect"""
+        try:
+            gpu_config = self.config._config.get('gpu', {})
+            device_setting = gpu_config.get('device', 'auto')
+            fallback_to_cpu = gpu_config.get('fallback_to_cpu', True)
+            gpu_enabled = gpu_config.get('enabled', True)
+
+            # If GPU is disabled in config, use CPU
+            if not gpu_enabled:
+                logger.info("GPU disabled in config.yaml, using CPU")
+                return torch.device('cpu')
+
+            # Handle device selection
+            if device_setting == 'cpu':
+                logger.info("Device set to CPU in config.yaml")
+                return torch.device('cpu')
+            elif device_setting == 'cuda' or device_setting == 'auto':
+                # Try GPU first with compatibility test
+                if torch.cuda.is_available():
+                    try:
+                        # Test CUDA availability with actual Linear layer operation
+                        # This catches architecture-specific issues like gfx1151 incompatibility
+                        test_tensor = torch.randn(2, 10).cuda()
+                        test_linear = torch.nn.Linear(10, 5).cuda()
+                        test_result = test_linear(test_tensor)
+                        logger.info(f"GPU compatibility test passed: {torch.cuda.get_device_name(0)}")
+                        logger.info("CUDA/ROCm device initialized successfully")
+                        return torch.device("cuda")
+                    except Exception as e:
+                        logger.warning(f"CUDA/ROCm initialization failed: {e}")
+                        logger.warning("GPU architecture may not be supported - falling back to CPU")
+                        logger.warning("This is common with newer AMD GPUs (gfx1151+) that require specific PyTorch builds")
+                        if fallback_to_cpu:
+                            return torch.device("cpu")
+                        else:
+                            raise RuntimeError("GPU initialization failed and fallback_to_cpu is False")
+                else:
+                    if fallback_to_cpu:
+                        logger.warning("CUDA not available, falling back to CPU")
+                        return torch.device('cpu')
+                    else:
+                        raise RuntimeError("CUDA not available and fallback_to_cpu is False")
+            else:
+                logger.warning(f"Unknown device setting '{device_setting}', using auto-detection")
+                return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        except RuntimeError:
+            # Re-raise the intentional no-fallback errors above instead of
+            # letting the generic handler below swallow them
+            raise
+        except Exception as e:
+            logger.warning(f"Error reading device config: {e}, using auto-detection")
+            # Fallback to auto-detection
+            return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
         # Canonical model name aliases to eliminate ambiguity across UI/DB/FS
         # Canonical → accepted aliases (internal/legacy)
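
For reference, a minimal standalone sketch of the device-resolution logic this commit introduces. The gpu: block layout is inferred from the keys the new method reads (gpu.enabled, gpu.device, gpu.fallback_to_cpu); the resolve_device helper and the "config.yaml" path are illustrative, not part of the commit.

import torch
import yaml

def resolve_device(config_path: str = "config.yaml") -> torch.device:
    """Resolve a torch device from a config.yaml 'gpu' block (sketch).

    Assumed layout:
        gpu:
          enabled: true
          device: auto          # auto | cuda | cpu
          fallback_to_cpu: true
    """
    with open(config_path) as f:
        gpu = (yaml.safe_load(f) or {}).get('gpu', {})

    if not gpu.get('enabled', True) or gpu.get('device', 'auto') == 'cpu':
        return torch.device('cpu')

    if torch.cuda.is_available():
        try:
            # Same probe as the commit: a real Linear forward pass catches
            # architecture issues (e.g. gfx1151) that is_available() misses
            torch.nn.Linear(10, 5).cuda()(torch.randn(2, 10).cuda())
            return torch.device('cuda')
        except Exception:
            if not gpu.get('fallback_to_cpu', True):
                raise
    elif not gpu.get('fallback_to_cpu', True):
        raise RuntimeError("CUDA not available and fallback_to_cpu is False")
    return torch.device('cpu')

if __name__ == "__main__":
    print(resolve_device())

Running the probe once at startup is cheap and turns a crash deep inside training into a clean CPU fallback, which is why the commit exercises a real layer instead of trusting torch.cuda.is_available() alone.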