diff --git a/core/orchestrator.py b/core/orchestrator.py
index 9140576..1e78086 100644
--- a/core/orchestrator.py
+++ b/core/orchestrator.py
@@ -1646,16 +1646,17 @@ class TradingOrchestrator:
             prediction_price = historical_data['close'].iloc[-1]  # Simplified
             price_change_pct = (current_price - prediction_price) / prediction_price * 100
 
-            # Determine if prediction was correct
+            # Enhanced reward system based on prediction confidence and price movement magnitude
             predicted_action = prediction['action']
-            was_correct = False
+            prediction_confidence = prediction.get('confidence', 0.5)  # Default to 0.5 if missing
 
-            if predicted_action == 'BUY' and price_change_pct > 0.1:  # Price went up
-                was_correct = True
-            elif predicted_action == 'SELL' and price_change_pct < -0.1:  # Price went down
-                was_correct = True
-            elif predicted_action == 'HOLD' and abs(price_change_pct) < 0.1:  # Price stayed stable
-                was_correct = True
+            # Calculate sophisticated reward based on multiple factors
+            reward, was_correct = self._calculate_sophisticated_reward(
+                predicted_action,
+                prediction_confidence,
+                price_change_pct,
+                time_diff
+            )
 
             # Update model performance tracking
             if model_name not in self.model_performance:
@@ -1670,24 +1671,102 @@ class TradingOrchestrator:
                 self.model_performance[model_name]['total']
             )
 
-            # Train the specific model based on outcome
-            await self._train_model_on_outcome(record, was_correct, price_change_pct)
+            # Train the specific model based on sophisticated outcome
+            await self._train_model_on_outcome(record, was_correct, price_change_pct, reward)
 
             logger.debug(f"Evaluated {model_name} prediction: {'✓' if was_correct else '✗'} "
-                         f"({prediction['action']}, {price_change_pct:.2f}% change)")
+                         f"({prediction['action']}, {price_change_pct:.2f}% change, "
+                         f"confidence: {prediction_confidence:.3f}, reward: {reward:.3f})")
 
         except Exception as e:
             logger.error(f"Error evaluating and training on record: {e}")
 
-    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float):
-        """Train specific model based on prediction outcome"""
+    def _calculate_sophisticated_reward(self, predicted_action: str, prediction_confidence: float,
+                                        price_change_pct: float, time_diff_minutes: float) -> tuple[float, bool]:
+        """
+        Calculate sophisticated reward based on prediction accuracy, confidence, and price movement magnitude
+
+        Args:
+            predicted_action: The predicted action ('BUY', 'SELL', 'HOLD')
+            prediction_confidence: Model's confidence in the prediction (0.0 to 1.0)
+            price_change_pct: Actual price change percentage
+            time_diff_minutes: Time elapsed since prediction
+
+        Returns:
+            tuple: (reward, was_correct)
+        """
+        try:
+            # Base thresholds for determining correctness
+            movement_threshold = 0.1  # 0.1% minimum movement to consider significant
+
+            # Determine if prediction was directionally correct
+            was_correct = False
+            directional_accuracy = 0.0
+
+            if predicted_action == 'BUY':
+                was_correct = price_change_pct > movement_threshold
+                directional_accuracy = max(0, price_change_pct)  # Positive for upward movement
+            elif predicted_action == 'SELL':
+                was_correct = price_change_pct < -movement_threshold
+                directional_accuracy = max(0, -price_change_pct)  # Positive for downward movement
+            elif predicted_action == 'HOLD':
+                was_correct = abs(price_change_pct) < movement_threshold
+                directional_accuracy = max(0, movement_threshold - abs(price_change_pct))  # Positive for stability
+
+            # Calculate magnitude-based multiplier (higher rewards for larger correct movements)
+            magnitude_multiplier = min(abs(price_change_pct) / 2.0, 3.0)  # Cap at 3x for 6% moves
+
+            # Calculate confidence-based reward adjustment
+            if was_correct:
+                # Reward confident correct predictions more, penalize unconfident correct predictions less
+                confidence_multiplier = 0.5 + (prediction_confidence * 1.5)  # Range: 0.5 to 2.0
+                base_reward = directional_accuracy * magnitude_multiplier * confidence_multiplier
+
+                # Bonus for high-confidence correct predictions with large movements
+                if prediction_confidence > 0.8 and abs(price_change_pct) > 1.0:
+                    base_reward *= 1.5  # 50% bonus for very confident + large movement
+
+            else:
+                # Penalize incorrect predictions more severely if they were confident
+                confidence_penalty = 0.5 + (prediction_confidence * 1.5)  # Higher confidence = higher penalty
+                base_penalty = abs(price_change_pct) * confidence_penalty
+
+                # Extra penalty for very confident wrong predictions
+                if prediction_confidence > 0.8:
+                    base_penalty *= 2.0  # Double penalty for overconfident wrong predictions
+
+                base_reward = -base_penalty
+
+            # Time decay factor (predictions should be evaluated quickly)
+            time_decay = max(0.1, 1.0 - (time_diff_minutes / 60.0))  # Decay over 1 hour, min 10%
+
+            # Final reward calculation
+            final_reward = base_reward * time_decay
+
+            # Clamp reward to reasonable range
+            final_reward = max(-5.0, min(5.0, final_reward))
+
+            return final_reward, was_correct
+
+        except Exception as e:
+            logger.error(f"Error calculating sophisticated reward: {e}")
+            # Fallback to simple reward
+            simple_correct = (
+                (predicted_action == 'BUY' and price_change_pct > 0.1) or
+                (predicted_action == 'SELL' and price_change_pct < -0.1) or
+                (predicted_action == 'HOLD' and abs(price_change_pct) < 0.1)
+            )
+            return (1.0 if simple_correct else -0.5, simple_correct)
+
+    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float, sophisticated_reward: float = None):
+        """Train specific model based on prediction outcome with sophisticated reward system"""
         try:
             model_name = record['model_name']
             model_input = record['model_input']
             prediction = record['prediction']
 
-            # Create training signal based on outcome
-            reward = 1.0 if was_correct else -0.5
+            # Use sophisticated reward if provided, otherwise fallback to simple reward
+            reward = sophisticated_reward if sophisticated_reward is not None else (1.0 if was_correct else -0.5)
 
             # Train RL models
             if 'dqn' in model_name.lower() and self.rl_agent:
@@ -1700,14 +1779,14 @@ class TradingOrchestrator:
                     next_state=model_input,  # Simplified
                     done=True
                 )
-                logger.debug(f"Added RL training experience: reward={reward}")
+                logger.debug(f"Added RL training experience: reward={reward:.3f} (sophisticated)")
 
             # Train CNN models using adapter
             elif 'cnn' in model_name.lower() and hasattr(self, 'cnn_adapter') and self.cnn_adapter:
                 # Use the adapter's add_training_sample method
                 actual_action = prediction['action']
                 self.cnn_adapter.add_training_sample(record['symbol'], actual_action, reward)
-                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward}")
+                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward:.3f} (sophisticated)")
 
                 # Trigger training if we have enough samples
                 if len(self.cnn_adapter.training_data) >= self.cnn_adapter.batch_size:
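Reviewer note: for sanity-checking the shaping logic outside the orchestrator, below is a minimal standalone sketch of the same formula. The function name sketch_reward and the example inputs are illustrative only and are not part of the patch; the _calculate_sophisticated_reward method in the diff is the authoritative implementation.

def sketch_reward(action: str, confidence: float, price_change_pct: float, minutes_elapsed: float) -> tuple:
    # Restates the reward shaping from the diff: direction check, magnitude and
    # confidence scaling, overconfidence penalty, time decay, and clamping.
    threshold = 0.1  # 0.1% move counts as significant
    if action == 'BUY':
        correct = price_change_pct > threshold
        accuracy = max(0.0, price_change_pct)
    elif action == 'SELL':
        correct = price_change_pct < -threshold
        accuracy = max(0.0, -price_change_pct)
    else:  # HOLD
        correct = abs(price_change_pct) < threshold
        accuracy = max(0.0, threshold - abs(price_change_pct))

    magnitude = min(abs(price_change_pct) / 2.0, 3.0)  # capped at 3x for a 6% move
    scale = 0.5 + confidence * 1.5                     # confidence scaling, range 0.5..2.0

    if correct:
        reward = accuracy * magnitude * scale
        if confidence > 0.8 and abs(price_change_pct) > 1.0:
            reward *= 1.5                              # bonus: confident call on a large move
    else:
        penalty = abs(price_change_pct) * scale
        if confidence > 0.8:
            penalty *= 2.0                             # doubled penalty when overconfident and wrong
        reward = -penalty

    decay = max(0.1, 1.0 - minutes_elapsed / 60.0)     # fades over an hour, floor of 10%
    return max(-5.0, min(5.0, reward * decay)), correct


if __name__ == '__main__':
    # Confident, correct BUY on a 1.5% move, evaluated after 5 minutes
    print(sketch_reward('BUY', 0.9, 1.5, 5))   # ~(2.86, True)
    # Equally confident but wrong SELL on the same move (clamped at the -5.0 floor)
    print(sketch_reward('SELL', 0.9, 1.5, 5))  # (-5.0, False)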