improved reward/penalty

Dobromir Popov
2025-07-25 14:15:43 +03:00
parent 26eeb9b35b
commit 130a52fb9b

@@ -1646,16 +1646,17 @@ class TradingOrchestrator:
             prediction_price = historical_data['close'].iloc[-1]  # Simplified
             price_change_pct = (current_price - prediction_price) / prediction_price * 100
 
-            # Determine if prediction was correct
+            # Enhanced reward system based on prediction confidence and price movement magnitude
             predicted_action = prediction['action']
-            was_correct = False
-            if predicted_action == 'BUY' and price_change_pct > 0.1:  # Price went up
-                was_correct = True
-            elif predicted_action == 'SELL' and price_change_pct < -0.1:  # Price went down
-                was_correct = True
-            elif predicted_action == 'HOLD' and abs(price_change_pct) < 0.1:  # Price stayed stable
-                was_correct = True
+            prediction_confidence = prediction.get('confidence', 0.5)  # Default to 0.5 if missing
+
+            # Calculate sophisticated reward based on multiple factors
+            reward, was_correct = self._calculate_sophisticated_reward(
+                predicted_action,
+                prediction_confidence,
+                price_change_pct,
+                time_diff
+            )
 
             # Update model performance tracking
             if model_name not in self.model_performance:
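The new call above hands time_diff (minutes elapsed since the prediction was made) to the reward function; how that value is derived lies outside this hunk. A minimal sketch of such an elapsed-time helper, where the record['timestamp'] field name and the timezone handling are assumptions for illustration, not taken from this diff:

from datetime import datetime, timezone

def minutes_since(prediction_time: datetime) -> float:
    """Hypothetical helper: minutes elapsed since a prediction was made.

    Assumes timezone-aware timestamps; not part of the commit above."""
    now = datetime.now(timezone.utc)
    return (now - prediction_time).total_seconds() / 60.0

# e.g. time_diff = minutes_since(record['timestamp'])  # 'timestamp' field name is assumed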
@@ -1670,24 +1671,102 @@ class TradingOrchestrator:
                 self.model_performance[model_name]['total']
             )
 
-            # Train the specific model based on outcome
-            await self._train_model_on_outcome(record, was_correct, price_change_pct)
+            # Train the specific model based on sophisticated outcome
+            await self._train_model_on_outcome(record, was_correct, price_change_pct, reward)
 
             logger.debug(f"Evaluated {model_name} prediction: {'✓' if was_correct else '✗'} "
-                         f"({prediction['action']}, {price_change_pct:.2f}% change)")
+                         f"({prediction['action']}, {price_change_pct:.2f}% change, "
+                         f"confidence: {prediction_confidence:.3f}, reward: {reward:.3f})")
 
         except Exception as e:
             logger.error(f"Error evaluating and training on record: {e}")
 
-    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float):
-        """Train specific model based on prediction outcome"""
+    def _calculate_sophisticated_reward(self, predicted_action: str, prediction_confidence: float,
+                                        price_change_pct: float, time_diff_minutes: float) -> tuple[float, bool]:
+        """
+        Calculate sophisticated reward based on prediction accuracy, confidence, and price movement magnitude
+
+        Args:
+            predicted_action: The predicted action ('BUY', 'SELL', 'HOLD')
+            prediction_confidence: Model's confidence in the prediction (0.0 to 1.0)
+            price_change_pct: Actual price change percentage
+            time_diff_minutes: Time elapsed since prediction
+
+        Returns:
+            tuple: (reward, was_correct)
+        """
+        try:
+            # Base thresholds for determining correctness
+            movement_threshold = 0.1  # 0.1% minimum movement to consider significant
+
+            # Determine if prediction was directionally correct
+            was_correct = False
+            directional_accuracy = 0.0
+
+            if predicted_action == 'BUY':
+                was_correct = price_change_pct > movement_threshold
+                directional_accuracy = max(0, price_change_pct)  # Positive for upward movement
+            elif predicted_action == 'SELL':
+                was_correct = price_change_pct < -movement_threshold
+                directional_accuracy = max(0, -price_change_pct)  # Positive for downward movement
+            elif predicted_action == 'HOLD':
+                was_correct = abs(price_change_pct) < movement_threshold
+                directional_accuracy = max(0, movement_threshold - abs(price_change_pct))  # Positive for stability
+
+            # Calculate magnitude-based multiplier (higher rewards for larger correct movements)
+            magnitude_multiplier = min(abs(price_change_pct) / 2.0, 3.0)  # Cap at 3x for 6% moves
+
+            # Calculate confidence-based reward adjustment
+            if was_correct:
+                # Reward confident correct predictions more, penalize unconfident correct predictions less
+                confidence_multiplier = 0.5 + (prediction_confidence * 1.5)  # Range: 0.5 to 2.0
+                base_reward = directional_accuracy * magnitude_multiplier * confidence_multiplier
+
+                # Bonus for high-confidence correct predictions with large movements
+                if prediction_confidence > 0.8 and abs(price_change_pct) > 1.0:
+                    base_reward *= 1.5  # 50% bonus for very confident + large movement
+            else:
+                # Penalize incorrect predictions more severely if they were confident
+                confidence_penalty = 0.5 + (prediction_confidence * 1.5)  # Higher confidence = higher penalty
+                base_penalty = abs(price_change_pct) * confidence_penalty
+
+                # Extra penalty for very confident wrong predictions
+                if prediction_confidence > 0.8:
+                    base_penalty *= 2.0  # Double penalty for overconfident wrong predictions
+
+                base_reward = -base_penalty
+
+            # Time decay factor (predictions should be evaluated quickly)
+            time_decay = max(0.1, 1.0 - (time_diff_minutes / 60.0))  # Decay over 1 hour, min 10%
+
+            # Final reward calculation
+            final_reward = base_reward * time_decay
+
+            # Clamp reward to reasonable range
+            final_reward = max(-5.0, min(5.0, final_reward))
+
+            return final_reward, was_correct
+
+        except Exception as e:
+            logger.error(f"Error calculating sophisticated reward: {e}")
+            # Fallback to simple reward
+            simple_correct = (
+                (predicted_action == 'BUY' and price_change_pct > 0.1) or
+                (predicted_action == 'SELL' and price_change_pct < -0.1) or
+                (predicted_action == 'HOLD' and abs(price_change_pct) < 0.1)
+            )
+            return (1.0 if simple_correct else -0.5, simple_correct)
+
+    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float, sophisticated_reward: float = None):
+        """Train specific model based on prediction outcome with sophisticated reward system"""
         try:
             model_name = record['model_name']
             model_input = record['model_input']
             prediction = record['prediction']
 
-            # Create training signal based on outcome
-            reward = 1.0 if was_correct else -0.5
+            # Use sophisticated reward if provided, otherwise fallback to simple reward
+            reward = sophisticated_reward if sophisticated_reward is not None else (1.0 if was_correct else -0.5)
 
             # Train RL models
             if 'dqn' in model_name.lower() and self.rl_agent:
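To make the scaling concrete, here is a standalone sketch of the arithmetic in _calculate_sophisticated_reward with illustrative numbers (not part of the commit): a correct, confident BUY versus an equally confident but wrong SELL on the same 1.5% upward move, both evaluated after 10 minutes.

# Illustration only: mirrors the reward arithmetic above with sample inputs.
confidence = 0.9
price_change_pct = 1.5        # price rose 1.5%
time_diff_minutes = 10.0

# Correct BUY
directional_accuracy = max(0, price_change_pct)                 # 1.5
magnitude_multiplier = min(abs(price_change_pct) / 2.0, 3.0)    # 0.75
confidence_multiplier = 0.5 + confidence * 1.5                  # 1.85
base_reward = directional_accuracy * magnitude_multiplier * confidence_multiplier  # ~2.081
base_reward *= 1.5                                              # >0.8 confidence and >1% move bonus -> ~3.122
time_decay = max(0.1, 1.0 - time_diff_minutes / 60.0)           # ~0.833
final_reward = max(-5.0, min(5.0, base_reward * time_decay))    # ~2.60

# Wrong, equally confident SELL on the same move: penalty = 1.5 * 1.85 = 2.775,
# doubled for overconfidence to 5.55, then decayed -> reward = -4.625 (inside the [-5, 5] clamp).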
@@ -1700,14 +1779,14 @@ class TradingOrchestrator:
                     next_state=model_input,  # Simplified
                     done=True
                 )
-                logger.debug(f"Added RL training experience: reward={reward}")
+                logger.debug(f"Added RL training experience: reward={reward:.3f} (sophisticated)")
 
             # Train CNN models using adapter
             elif 'cnn' in model_name.lower() and hasattr(self, 'cnn_adapter') and self.cnn_adapter:
                 # Use the adapter's add_training_sample method
                 actual_action = prediction['action']
                 self.cnn_adapter.add_training_sample(record['symbol'], actual_action, reward)
-                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward}")
+                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward:.3f} (sophisticated)")
 
                 # Trigger training if we have enough samples
                 if len(self.cnn_adapter.training_data) >= self.cnn_adapter.batch_size:
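Because the new sophisticated_reward parameter defaults to None, existing callers of _train_model_on_outcome keep their old behavior. A small sketch of just that selection logic, isolated for illustration (the helper name is made up; the values echo the worked example above):

def pick_reward(was_correct, sophisticated_reward=None):
    """Mirrors the reward-selection line in _train_model_on_outcome (sketch only)."""
    return sophisticated_reward if sophisticated_reward is not None else (1.0 if was_correct else -0.5)

pick_reward(True)          # 1.0   -> legacy path, no sophisticated reward supplied
pick_reward(True, 2.60)    # 2.60  -> sophisticated reward takes precedence when provided
pick_reward(False)         # -0.5  -> legacy penalty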