improved reward/penalty

Dobromir Popov
2025-07-25 14:15:43 +03:00
parent 26eeb9b35b
commit 130a52fb9b

@@ -1646,16 +1646,17 @@ class TradingOrchestrator:
             prediction_price = historical_data['close'].iloc[-1]  # Simplified
             price_change_pct = (current_price - prediction_price) / prediction_price * 100
 
-            # Determine if prediction was correct
+            # Enhanced reward system based on prediction confidence and price movement magnitude
             predicted_action = prediction['action']
-            was_correct = False
-            if predicted_action == 'BUY' and price_change_pct > 0.1:  # Price went up
-                was_correct = True
-            elif predicted_action == 'SELL' and price_change_pct < -0.1:  # Price went down
-                was_correct = True
-            elif predicted_action == 'HOLD' and abs(price_change_pct) < 0.1:  # Price stayed stable
-                was_correct = True
+            prediction_confidence = prediction.get('confidence', 0.5)  # Default to 0.5 if missing
+
+            # Calculate sophisticated reward based on multiple factors
+            reward, was_correct = self._calculate_sophisticated_reward(
+                predicted_action,
+                prediction_confidence,
+                price_change_pct,
+                time_diff
+            )
 
             # Update model performance tracking
             if model_name not in self.model_performance:
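The new call above hands time_diff (minutes elapsed since the prediction was made) to the reward function; how that value is derived lies outside this hunk. A minimal sketch of such an elapsed-time helper, where the record['timestamp'] field name and the timezone handling are assumptions for illustration, not taken from this diff:

from datetime import datetime, timezone

def minutes_since(prediction_time: datetime) -> float:
    """Hypothetical helper: minutes elapsed since a prediction was made.

    Assumes timezone-aware timestamps; not part of the commit above."""
    now = datetime.now(timezone.utc)
    return (now - prediction_time).total_seconds() / 60.0

# e.g. time_diff = minutes_since(record['timestamp'])  # 'timestamp' field name is assumed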
@@ -1670,24 +1671,102 @@ class TradingOrchestrator:
                 self.model_performance[model_name]['total']
             )
 
-            # Train the specific model based on outcome
-            await self._train_model_on_outcome(record, was_correct, price_change_pct)
+            # Train the specific model based on sophisticated outcome
+            await self._train_model_on_outcome(record, was_correct, price_change_pct, reward)
 
             logger.debug(f"Evaluated {model_name} prediction: {'✓' if was_correct else '✗'} "
-                         f"({prediction['action']}, {price_change_pct:.2f}% change)")
+                         f"({prediction['action']}, {price_change_pct:.2f}% change, "
+                         f"confidence: {prediction_confidence:.3f}, reward: {reward:.3f})")
 
         except Exception as e:
             logger.error(f"Error evaluating and training on record: {e}")
 
-    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float):
-        """Train specific model based on prediction outcome"""
+    def _calculate_sophisticated_reward(self, predicted_action: str, prediction_confidence: float,
+                                        price_change_pct: float, time_diff_minutes: float) -> tuple[float, bool]:
+        """
+        Calculate sophisticated reward based on prediction accuracy, confidence, and price movement magnitude
+
+        Args:
+            predicted_action: The predicted action ('BUY', 'SELL', 'HOLD')
+            prediction_confidence: Model's confidence in the prediction (0.0 to 1.0)
+            price_change_pct: Actual price change percentage
+            time_diff_minutes: Time elapsed since prediction
+
+        Returns:
+            tuple: (reward, was_correct)
+        """
+        try:
+            # Base thresholds for determining correctness
+            movement_threshold = 0.1  # 0.1% minimum movement to consider significant
+
+            # Determine if prediction was directionally correct
+            was_correct = False
+            directional_accuracy = 0.0
+
+            if predicted_action == 'BUY':
+                was_correct = price_change_pct > movement_threshold
+                directional_accuracy = max(0, price_change_pct)  # Positive for upward movement
+            elif predicted_action == 'SELL':
+                was_correct = price_change_pct < -movement_threshold
+                directional_accuracy = max(0, -price_change_pct)  # Positive for downward movement
+            elif predicted_action == 'HOLD':
+                was_correct = abs(price_change_pct) < movement_threshold
+                directional_accuracy = max(0, movement_threshold - abs(price_change_pct))  # Positive for stability
+
+            # Calculate magnitude-based multiplier (higher rewards for larger correct movements)
+            magnitude_multiplier = min(abs(price_change_pct) / 2.0, 3.0)  # Cap at 3x for 6% moves
+
+            # Calculate confidence-based reward adjustment
+            if was_correct:
+                # Reward confident correct predictions more, penalize unconfident correct predictions less
+                confidence_multiplier = 0.5 + (prediction_confidence * 1.5)  # Range: 0.5 to 2.0
+                base_reward = directional_accuracy * magnitude_multiplier * confidence_multiplier
+
+                # Bonus for high-confidence correct predictions with large movements
+                if prediction_confidence > 0.8 and abs(price_change_pct) > 1.0:
+                    base_reward *= 1.5  # 50% bonus for very confident + large movement
+            else:
+                # Penalize incorrect predictions more severely if they were confident
+                confidence_penalty = 0.5 + (prediction_confidence * 1.5)  # Higher confidence = higher penalty
+                base_penalty = abs(price_change_pct) * confidence_penalty
+
+                # Extra penalty for very confident wrong predictions
+                if prediction_confidence > 0.8:
+                    base_penalty *= 2.0  # Double penalty for overconfident wrong predictions
+
+                base_reward = -base_penalty
+
+            # Time decay factor (predictions should be evaluated quickly)
+            time_decay = max(0.1, 1.0 - (time_diff_minutes / 60.0))  # Decay over 1 hour, min 10%
+
+            # Final reward calculation
+            final_reward = base_reward * time_decay
+
+            # Clamp reward to reasonable range
+            final_reward = max(-5.0, min(5.0, final_reward))
+
+            return final_reward, was_correct
+
+        except Exception as e:
+            logger.error(f"Error calculating sophisticated reward: {e}")
+            # Fallback to simple reward
+            simple_correct = (
+                (predicted_action == 'BUY' and price_change_pct > 0.1) or
+                (predicted_action == 'SELL' and price_change_pct < -0.1) or
+                (predicted_action == 'HOLD' and abs(price_change_pct) < 0.1)
+            )
+            return (1.0 if simple_correct else -0.5, simple_correct)
+
+    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float, sophisticated_reward: float = None):
+        """Train specific model based on prediction outcome with sophisticated reward system"""
         try:
             model_name = record['model_name']
             model_input = record['model_input']
             prediction = record['prediction']
 
-            # Create training signal based on outcome
-            reward = 1.0 if was_correct else -0.5
+            # Use sophisticated reward if provided, otherwise fallback to simple reward
+            reward = sophisticated_reward if sophisticated_reward is not None else (1.0 if was_correct else -0.5)
 
             # Train RL models
             if 'dqn' in model_name.lower() and self.rl_agent:
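To make the scaling concrete, here is a standalone sketch of the arithmetic in _calculate_sophisticated_reward with illustrative numbers (not part of the commit): a correct, confident BUY versus an equally confident but wrong SELL on the same 1.5% upward move, both evaluated after 10 minutes.

# Illustration only: mirrors the reward arithmetic above with sample inputs.
confidence = 0.9
price_change_pct = 1.5        # price rose 1.5%
time_diff_minutes = 10.0

# Correct BUY
directional_accuracy = max(0, price_change_pct)                 # 1.5
magnitude_multiplier = min(abs(price_change_pct) / 2.0, 3.0)    # 0.75
confidence_multiplier = 0.5 + confidence * 1.5                  # 1.85
base_reward = directional_accuracy * magnitude_multiplier * confidence_multiplier  # ~2.081
base_reward *= 1.5                                              # >0.8 confidence and >1% move bonus -> ~3.122
time_decay = max(0.1, 1.0 - time_diff_minutes / 60.0)           # ~0.833
final_reward = max(-5.0, min(5.0, base_reward * time_decay))    # ~2.60

# Wrong, equally confident SELL on the same move: penalty = 1.5 * 1.85 = 2.775,
# doubled for overconfidence to 5.55, then decayed -> reward = -4.625 (inside the [-5, 5] clamp).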
@@ -1700,14 +1779,14 @@ class TradingOrchestrator:
                     next_state=model_input,  # Simplified
                     done=True
                 )
-                logger.debug(f"Added RL training experience: reward={reward}")
+                logger.debug(f"Added RL training experience: reward={reward:.3f} (sophisticated)")
 
             # Train CNN models using adapter
             elif 'cnn' in model_name.lower() and hasattr(self, 'cnn_adapter') and self.cnn_adapter:
                 # Use the adapter's add_training_sample method
                 actual_action = prediction['action']
                 self.cnn_adapter.add_training_sample(record['symbol'], actual_action, reward)
-                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward}")
+                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward:.3f} (sophisticated)")
 
                 # Trigger training if we have enough samples
                 if len(self.cnn_adapter.training_data) >= self.cnn_adapter.batch_size:
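Because the new sophisticated_reward parameter defaults to None, existing callers of _train_model_on_outcome keep their old behavior. A small sketch of just that selection logic, isolated for illustration (the helper name is made up; the values echo the worked example above):

def pick_reward(was_correct, sophisticated_reward=None):
    """Mirrors the reward-selection line in _train_model_on_outcome (sketch only)."""
    return sophisticated_reward if sophisticated_reward is not None else (1.0 if was_correct else -0.5)

pick_reward(True)          # 1.0   -> legacy path, no sophisticated reward supplied
pick_reward(True, 2.60)    # 2.60  -> sophisticated reward takes precedence when provided
pick_reward(False)         # -0.5  -> legacy penalty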