improved reward/penalty
@@ -1646,16 +1646,17 @@ class TradingOrchestrator:
             prediction_price = historical_data['close'].iloc[-1]  # Simplified
             price_change_pct = (current_price - prediction_price) / prediction_price * 100
 
-            # Determine if prediction was correct
+            # Enhanced reward system based on prediction confidence and price movement magnitude
             predicted_action = prediction['action']
-            was_correct = False
+            prediction_confidence = prediction.get('confidence', 0.5)  # Default to 0.5 if missing
 
-            if predicted_action == 'BUY' and price_change_pct > 0.1:  # Price went up
-                was_correct = True
-            elif predicted_action == 'SELL' and price_change_pct < -0.1:  # Price went down
-                was_correct = True
-            elif predicted_action == 'HOLD' and abs(price_change_pct) < 0.1:  # Price stayed stable
-                was_correct = True
+            # Calculate sophisticated reward based on multiple factors
+            reward, was_correct = self._calculate_sophisticated_reward(
+                predicted_action,
+                prediction_confidence,
+                price_change_pct,
+                time_diff
+            )
 
             # Update model performance tracking
             if model_name not in self.model_performance:
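Note that the helper introduced below names its last parameter time_diff_minutes, so the call above assumes time_diff is already expressed in minutes. A minimal sketch of how such a value might be derived from the prediction record; the minutes_since helper and the 'timestamp' field are hypothetical and not part of this diff:

from datetime import datetime, timezone

def minutes_since(prediction_timestamp: datetime) -> float:
    """Elapsed minutes between a prediction and now (UTC) -- illustration only."""
    return (datetime.now(timezone.utc) - prediction_timestamp).total_seconds() / 60.0

# time_diff = minutes_since(record['timestamp'])  # hypothetical field name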
@@ -1670,24 +1671,102 @@ class TradingOrchestrator:
                     self.model_performance[model_name]['total']
                 )
 
-            # Train the specific model based on outcome
-            await self._train_model_on_outcome(record, was_correct, price_change_pct)
+            # Train the specific model based on sophisticated outcome
+            await self._train_model_on_outcome(record, was_correct, price_change_pct, reward)
 
             logger.debug(f"Evaluated {model_name} prediction: {'✓' if was_correct else '✗'} "
-                         f"({prediction['action']}, {price_change_pct:.2f}% change)")
+                         f"({prediction['action']}, {price_change_pct:.2f}% change, "
+                         f"confidence: {prediction_confidence:.3f}, reward: {reward:.3f})")
 
         except Exception as e:
             logger.error(f"Error evaluating and training on record: {e}")
 
-    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float):
-        """Train specific model based on prediction outcome"""
+    def _calculate_sophisticated_reward(self, predicted_action: str, prediction_confidence: float,
+                                        price_change_pct: float, time_diff_minutes: float) -> tuple[float, bool]:
+        """
+        Calculate a sophisticated reward based on prediction accuracy, confidence, and price movement magnitude.
+
+        Args:
+            predicted_action: The predicted action ('BUY', 'SELL', 'HOLD')
+            prediction_confidence: Model's confidence in the prediction (0.0 to 1.0)
+            price_change_pct: Actual price change percentage
+            time_diff_minutes: Time elapsed since the prediction, in minutes
+
+        Returns:
+            tuple: (reward, was_correct)
+        """
+        try:
+            # Base threshold for determining correctness
+            movement_threshold = 0.1  # 0.1% minimum movement to consider significant
+
+            # Determine if the prediction was directionally correct
+            was_correct = False
+            directional_accuracy = 0.0
+
+            if predicted_action == 'BUY':
+                was_correct = price_change_pct > movement_threshold
+                directional_accuracy = max(0, price_change_pct)  # Positive for upward movement
+            elif predicted_action == 'SELL':
+                was_correct = price_change_pct < -movement_threshold
+                directional_accuracy = max(0, -price_change_pct)  # Positive for downward movement
+            elif predicted_action == 'HOLD':
+                was_correct = abs(price_change_pct) < movement_threshold
+                directional_accuracy = max(0, movement_threshold - abs(price_change_pct))  # Positive for stability
+
+            # Magnitude-based multiplier (higher rewards for larger correct movements)
+            magnitude_multiplier = min(abs(price_change_pct) / 2.0, 3.0)  # Caps at 3x for moves of 6% or more
+
+            # Confidence-based reward adjustment
+            if was_correct:
+                # Reward confident correct predictions more; scale unconfident correct predictions down
+                confidence_multiplier = 0.5 + (prediction_confidence * 1.5)  # Range: 0.5 to 2.0
+                base_reward = directional_accuracy * magnitude_multiplier * confidence_multiplier
+
+                # Bonus for high-confidence correct predictions with large movements
+                if prediction_confidence > 0.8 and abs(price_change_pct) > 1.0:
+                    base_reward *= 1.5  # 50% bonus for very confident + large movement
+
+            else:
+                # Penalize incorrect predictions more severely if they were confident
+                confidence_penalty = 0.5 + (prediction_confidence * 1.5)  # Higher confidence = higher penalty
+                base_penalty = abs(price_change_pct) * confidence_penalty
+
+                # Extra penalty for very confident wrong predictions
+                if prediction_confidence > 0.8:
+                    base_penalty *= 2.0  # Double penalty for overconfident wrong predictions
+
+                base_reward = -base_penalty
+
+            # Time decay factor (predictions should be evaluated quickly)
+            time_decay = max(0.1, 1.0 - (time_diff_minutes / 60.0))  # Decays over 1 hour, min 10%
+
+            # Final reward calculation
+            final_reward = base_reward * time_decay
+
+            # Clamp reward to a reasonable range
+            final_reward = max(-5.0, min(5.0, final_reward))
+
+            return final_reward, was_correct
+
+        except Exception as e:
+            logger.error(f"Error calculating sophisticated reward: {e}")
+            # Fall back to a simple reward
+            simple_correct = (
+                (predicted_action == 'BUY' and price_change_pct > 0.1) or
+                (predicted_action == 'SELL' and price_change_pct < -0.1) or
+                (predicted_action == 'HOLD' and abs(price_change_pct) < 0.1)
+            )
+            return (1.0 if simple_correct else -0.5, simple_correct)
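To make the arithmetic concrete, here is a short standalone walk-through of the formula above for two sample outcomes. The numbers follow directly from the constants in the new method; the snippet only mirrors that logic for illustration and does not import the orchestrator:

# Case 1: correct, confident BUY -- price +2.0% after 10 minutes, confidence 0.9
directional_accuracy = 2.0                     # max(0, price_change_pct)
magnitude_multiplier = min(2.0 / 2.0, 3.0)     # = 1.0
confidence_multiplier = 0.5 + 0.9 * 1.5        # = 1.85
base_reward = 2.0 * 1.0 * 1.85                 # = 3.70
base_reward *= 1.5                             # bonus: confidence > 0.8 and |move| > 1.0% -> 5.55
time_decay = max(0.1, 1.0 - 10 / 60.0)         # = 0.8333...
print(max(-5.0, min(5.0, base_reward * time_decay)))   # ~4.62 -> strong positive reward

# Case 2: wrong, confident SELL -- price +2.0% after 10 minutes, confidence 0.9
confidence_penalty = 0.5 + 0.9 * 1.5           # = 1.85
base_penalty = 2.0 * 1.85                      # = 3.70
base_penalty *= 2.0                            # overconfidence penalty -> 7.40
print(max(-5.0, min(5.0, -base_penalty * time_decay)))  # clamps to -5.0 -> maximum penalty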
+
+    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float, sophisticated_reward: float = None):
+        """Train the specific model based on the prediction outcome, using the sophisticated reward system"""
         try:
             model_name = record['model_name']
             model_input = record['model_input']
             prediction = record['prediction']
 
-            # Create training signal based on outcome
-            reward = 1.0 if was_correct else -0.5
+            # Use sophisticated reward if provided, otherwise fall back to the simple reward
+            reward = sophisticated_reward if sophisticated_reward is not None else (1.0 if was_correct else -0.5)
 
             # Train RL models
             if 'dqn' in model_name.lower() and self.rl_agent:
@@ -1700,14 +1779,14 @@ class TradingOrchestrator:
                     next_state=model_input,  # Simplified
                     done=True
                 )
-                logger.debug(f"Added RL training experience: reward={reward}")
+                logger.debug(f"Added RL training experience: reward={reward:.3f} (sophisticated)")
 
             # Train CNN models using adapter
             elif 'cnn' in model_name.lower() and hasattr(self, 'cnn_adapter') and self.cnn_adapter:
                 # Use the adapter's add_training_sample method
                 actual_action = prediction['action']
                 self.cnn_adapter.add_training_sample(record['symbol'], actual_action, reward)
-                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward}")
+                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward:.3f} (sophisticated)")
 
                 # Trigger training if we have enough samples
                 if len(self.cnn_adapter.training_data) >= self.cnn_adapter.batch_size:
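Because sophisticated_reward defaults to None, call sites that still pass only three arguments keep working and receive the original 1.0 / -0.5 signal; when the orchestrator passes the graded reward it flows unchanged into the DQN experience and the CNN training sample. A minimal standalone sketch of that fallback pattern (illustration only, not the orchestrator class):

from typing import Optional

def resolve_reward(was_correct: bool, sophisticated_reward: Optional[float] = None) -> float:
    """Mirror of the fallback in _train_model_on_outcome: prefer the graded reward, else the simple one."""
    return sophisticated_reward if sophisticated_reward is not None else (1.0 if was_correct else -0.5)

print(resolve_reward(True))          # 1.0  -> legacy call, no graded reward supplied
print(resolve_reward(True, 4.62))    # 4.62 -> graded reward passed through
print(resolve_reward(False, -5.0))   # -5.0 -> graded penalty passed through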