improved reward/penalty
@@ -1646,16 +1646,17 @@ class TradingOrchestrator:
             prediction_price = historical_data['close'].iloc[-1]  # Simplified
             price_change_pct = (current_price - prediction_price) / prediction_price * 100

-            # Determine if prediction was correct
+            # Enhanced reward system based on prediction confidence and price movement magnitude
             predicted_action = prediction['action']
-            was_correct = False
+            prediction_confidence = prediction.get('confidence', 0.5)  # Default to 0.5 if missing

-            if predicted_action == 'BUY' and price_change_pct > 0.1:  # Price went up
-                was_correct = True
-            elif predicted_action == 'SELL' and price_change_pct < -0.1:  # Price went down
-                was_correct = True
-            elif predicted_action == 'HOLD' and abs(price_change_pct) < 0.1:  # Price stayed stable
-                was_correct = True
+            # Calculate sophisticated reward based on multiple factors
+            reward, was_correct = self._calculate_sophisticated_reward(
+                predicted_action,
+                prediction_confidence,
+                price_change_pct,
+                time_diff
+            )

             # Update model performance tracking
             if model_name not in self.model_performance:
@@ -1670,24 +1671,102 @@ class TradingOrchestrator:
                 self.model_performance[model_name]['total']
             )

-            # Train the specific model based on outcome
-            await self._train_model_on_outcome(record, was_correct, price_change_pct)
+            # Train the specific model based on sophisticated outcome
+            await self._train_model_on_outcome(record, was_correct, price_change_pct, reward)

             logger.debug(f"Evaluated {model_name} prediction: {'✓' if was_correct else '✗'} "
-                         f"({prediction['action']}, {price_change_pct:.2f}% change)")
+                         f"({prediction['action']}, {price_change_pct:.2f}% change, "
+                         f"confidence: {prediction_confidence:.3f}, reward: {reward:.3f})")

         except Exception as e:
             logger.error(f"Error evaluating and training on record: {e}")

-    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float):
-        """Train specific model based on prediction outcome"""
+    def _calculate_sophisticated_reward(self, predicted_action: str, prediction_confidence: float,
+                                        price_change_pct: float, time_diff_minutes: float) -> tuple[float, bool]:
+        """
+        Calculate sophisticated reward based on prediction accuracy, confidence, and price movement magnitude
+
+        Args:
+            predicted_action: The predicted action ('BUY', 'SELL', 'HOLD')
+            prediction_confidence: Model's confidence in the prediction (0.0 to 1.0)
+            price_change_pct: Actual price change percentage
+            time_diff_minutes: Time elapsed since prediction
+
+        Returns:
+            tuple: (reward, was_correct)
+        """
+        try:
+            # Base thresholds for determining correctness
+            movement_threshold = 0.1  # 0.1% minimum movement to consider significant
+
+            # Determine if prediction was directionally correct
+            was_correct = False
+            directional_accuracy = 0.0
+
+            if predicted_action == 'BUY':
+                was_correct = price_change_pct > movement_threshold
+                directional_accuracy = max(0, price_change_pct)  # Positive for upward movement
+            elif predicted_action == 'SELL':
+                was_correct = price_change_pct < -movement_threshold
+                directional_accuracy = max(0, -price_change_pct)  # Positive for downward movement
+            elif predicted_action == 'HOLD':
+                was_correct = abs(price_change_pct) < movement_threshold
+                directional_accuracy = max(0, movement_threshold - abs(price_change_pct))  # Positive for stability
+
+            # Calculate magnitude-based multiplier (higher rewards for larger correct movements)
+            magnitude_multiplier = min(abs(price_change_pct) / 2.0, 3.0)  # Cap at 3x for 6% moves
+
+            # Calculate confidence-based reward adjustment
+            if was_correct:
+                # Reward confident correct predictions more, penalize unconfident correct predictions less
+                confidence_multiplier = 0.5 + (prediction_confidence * 1.5)  # Range: 0.5 to 2.0
+                base_reward = directional_accuracy * magnitude_multiplier * confidence_multiplier
+
+                # Bonus for high-confidence correct predictions with large movements
+                if prediction_confidence > 0.8 and abs(price_change_pct) > 1.0:
+                    base_reward *= 1.5  # 50% bonus for very confident + large movement
+
+            else:
+                # Penalize incorrect predictions more severely if they were confident
+                confidence_penalty = 0.5 + (prediction_confidence * 1.5)  # Higher confidence = higher penalty
+                base_penalty = abs(price_change_pct) * confidence_penalty
+
+                # Extra penalty for very confident wrong predictions
+                if prediction_confidence > 0.8:
+                    base_penalty *= 2.0  # Double penalty for overconfident wrong predictions
+
+                base_reward = -base_penalty
+
+            # Time decay factor (predictions should be evaluated quickly)
+            time_decay = max(0.1, 1.0 - (time_diff_minutes / 60.0))  # Decay over 1 hour, min 10%
+
+            # Final reward calculation
+            final_reward = base_reward * time_decay
+
+            # Clamp reward to reasonable range
+            final_reward = max(-5.0, min(5.0, final_reward))
+
+            return final_reward, was_correct
+
+        except Exception as e:
+            logger.error(f"Error calculating sophisticated reward: {e}")
+            # Fallback to simple reward
+            simple_correct = (
+                (predicted_action == 'BUY' and price_change_pct > 0.1) or
+                (predicted_action == 'SELL' and price_change_pct < -0.1) or
+                (predicted_action == 'HOLD' and abs(price_change_pct) < 0.1)
+            )
+            return (1.0 if simple_correct else -0.5, simple_correct)
+
+    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float, sophisticated_reward: float = None):
+        """Train specific model based on prediction outcome with sophisticated reward system"""
         try:
             model_name = record['model_name']
             model_input = record['model_input']
             prediction = record['prediction']

-            # Create training signal based on outcome
-            reward = 1.0 if was_correct else -0.5
+            # Use sophisticated reward if provided, otherwise fallback to simple reward
+            reward = sophisticated_reward if sophisticated_reward is not None else (1.0 if was_correct else -0.5)

             # Train RL models
             if 'dqn' in model_name.lower() and self.rl_agent:
@@ -1700,14 +1779,14 @@ class TradingOrchestrator:
                     next_state=model_input,  # Simplified
                     done=True
                 )
-                logger.debug(f"Added RL training experience: reward={reward}")
+                logger.debug(f"Added RL training experience: reward={reward:.3f} (sophisticated)")

             # Train CNN models using adapter
             elif 'cnn' in model_name.lower() and hasattr(self, 'cnn_adapter') and self.cnn_adapter:
                 # Use the adapter's add_training_sample method
                 actual_action = prediction['action']
                 self.cnn_adapter.add_training_sample(record['symbol'], actual_action, reward)
-                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward}")
+                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward:.3f} (sophisticated)")

                 # Trigger training if we have enough samples
                 if len(self.cnn_adapter.training_data) >= self.cnn_adapter.batch_size:
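
For a quick sanity check of the new shaping, below is a minimal standalone sketch that restates the same thresholds and multipliers outside the orchestrator. The helper name `sketch_reward` and the sample inputs are illustrative only; the commit itself adds `_calculate_sophisticated_reward` as a method of `TradingOrchestrator`.

# sketch_reward: an illustrative re-statement of the reward shaping in this commit.
# Not the orchestrator method itself; the name and sample inputs are hypothetical.

def sketch_reward(action: str, confidence: float, change_pct: float, minutes: float) -> tuple[float, bool]:
    threshold = 0.1  # 0.1% minimum move counted as significant
    if action == 'BUY':
        correct, accuracy = change_pct > threshold, max(0.0, change_pct)
    elif action == 'SELL':
        correct, accuracy = change_pct < -threshold, max(0.0, -change_pct)
    else:  # HOLD
        correct, accuracy = abs(change_pct) < threshold, max(0.0, threshold - abs(change_pct))

    magnitude = min(abs(change_pct) / 2.0, 3.0)   # capped at 3x (reached at a 6% move)
    scale = 0.5 + confidence * 1.5                # confidence factor, range 0.5 to 2.0
    if correct:
        reward = accuracy * magnitude * scale
        if confidence > 0.8 and abs(change_pct) > 1.0:
            reward *= 1.5                         # bonus: confident call on a large move
    else:
        penalty = abs(change_pct) * scale
        if confidence > 0.8:
            penalty *= 2.0                        # overconfident and wrong hurts most
        reward = -penalty

    reward *= max(0.1, 1.0 - minutes / 60.0)      # decay over one hour, floored at 10%
    return max(-5.0, min(5.0, reward)), correct

# A moderately confident, correct BUY on a 1.5% rise after 10 minutes:
print(sketch_reward('BUY', confidence=0.6, change_pct=1.5, minutes=10.0))   # ≈ (1.31, True)
# The same call when price instead fell 1.5%:
print(sketch_reward('BUY', confidence=0.6, change_pct=-1.5, minutes=10.0))  # ≈ (-1.75, False)
# An overconfident wrong call on a 2% drop hits the -5.0 clamp:
print(sketch_reward('BUY', confidence=0.9, change_pct=-2.0, minutes=10.0))  # (-5.0, False)

Compared with the old flat +1.0 / -0.5 signal, the shaped reward scales with movement size and model confidence, decays as the evaluation is delayed, and is clamped to [-5.0, 5.0], so an overconfident wrong call is punished far harder than a tentative one.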