improved reward/penalty
@@ -1646,16 +1646,17 @@ class TradingOrchestrator:
             prediction_price = historical_data['close'].iloc[-1]  # Simplified
             price_change_pct = (current_price - prediction_price) / prediction_price * 100

-            # Determine if prediction was correct
+            # Enhanced reward system based on prediction confidence and price movement magnitude
             predicted_action = prediction['action']
-            was_correct = False
+            prediction_confidence = prediction.get('confidence', 0.5)  # Default to 0.5 if missing

-            if predicted_action == 'BUY' and price_change_pct > 0.1:  # Price went up
-                was_correct = True
-            elif predicted_action == 'SELL' and price_change_pct < -0.1:  # Price went down
-                was_correct = True
-            elif predicted_action == 'HOLD' and abs(price_change_pct) < 0.1:  # Price stayed stable
-                was_correct = True
+            # Calculate sophisticated reward based on multiple factors
+            reward, was_correct = self._calculate_sophisticated_reward(
+                predicted_action,
+                prediction_confidence,
+                price_change_pct,
+                time_diff
+            )

             # Update model performance tracking
             if model_name not in self.model_performance:
@@ -1670,24 +1671,102 @@ class TradingOrchestrator:
                 self.model_performance[model_name]['total']
             )

-            # Train the specific model based on outcome
-            await self._train_model_on_outcome(record, was_correct, price_change_pct)
+            # Train the specific model based on sophisticated outcome
+            await self._train_model_on_outcome(record, was_correct, price_change_pct, reward)

             logger.debug(f"Evaluated {model_name} prediction: {'✓' if was_correct else '✗'} "
-                         f"({prediction['action']}, {price_change_pct:.2f}% change)")
+                         f"({prediction['action']}, {price_change_pct:.2f}% change, "
+                         f"confidence: {prediction_confidence:.3f}, reward: {reward:.3f})")

         except Exception as e:
             logger.error(f"Error evaluating and training on record: {e}")

-    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float):
-        """Train specific model based on prediction outcome"""
+    def _calculate_sophisticated_reward(self, predicted_action: str, prediction_confidence: float,
+                                        price_change_pct: float, time_diff_minutes: float) -> tuple[float, bool]:
+        """
+        Calculate sophisticated reward based on prediction accuracy, confidence, and price movement magnitude
+
+        Args:
+            predicted_action: The predicted action ('BUY', 'SELL', 'HOLD')
+            prediction_confidence: Model's confidence in the prediction (0.0 to 1.0)
+            price_change_pct: Actual price change percentage
+            time_diff_minutes: Time elapsed since prediction
+
+        Returns:
+            tuple: (reward, was_correct)
+        """
+        try:
+            # Base thresholds for determining correctness
+            movement_threshold = 0.1  # 0.1% minimum movement to consider significant
+
+            # Determine if prediction was directionally correct
+            was_correct = False
+            directional_accuracy = 0.0
+
+            if predicted_action == 'BUY':
+                was_correct = price_change_pct > movement_threshold
+                directional_accuracy = max(0, price_change_pct)  # Positive for upward movement
+            elif predicted_action == 'SELL':
+                was_correct = price_change_pct < -movement_threshold
+                directional_accuracy = max(0, -price_change_pct)  # Positive for downward movement
+            elif predicted_action == 'HOLD':
+                was_correct = abs(price_change_pct) < movement_threshold
+                directional_accuracy = max(0, movement_threshold - abs(price_change_pct))  # Positive for stability
+
+            # Calculate magnitude-based multiplier (higher rewards for larger correct movements)
+            magnitude_multiplier = min(abs(price_change_pct) / 2.0, 3.0)  # Cap at 3x for 6% moves
+
+            # Calculate confidence-based reward adjustment
+            if was_correct:
+                # Reward confident correct predictions more, penalize unconfident correct predictions less
+                confidence_multiplier = 0.5 + (prediction_confidence * 1.5)  # Range: 0.5 to 2.0
+                base_reward = directional_accuracy * magnitude_multiplier * confidence_multiplier
+
+                # Bonus for high-confidence correct predictions with large movements
+                if prediction_confidence > 0.8 and abs(price_change_pct) > 1.0:
+                    base_reward *= 1.5  # 50% bonus for very confident + large movement
+
+            else:
+                # Penalize incorrect predictions more severely if they were confident
+                confidence_penalty = 0.5 + (prediction_confidence * 1.5)  # Higher confidence = higher penalty
+                base_penalty = abs(price_change_pct) * confidence_penalty
+
+                # Extra penalty for very confident wrong predictions
+                if prediction_confidence > 0.8:
+                    base_penalty *= 2.0  # Double penalty for overconfident wrong predictions
+
+                base_reward = -base_penalty
+
+            # Time decay factor (predictions should be evaluated quickly)
+            time_decay = max(0.1, 1.0 - (time_diff_minutes / 60.0))  # Decay over 1 hour, min 10%
+
+            # Final reward calculation
+            final_reward = base_reward * time_decay
+
+            # Clamp reward to reasonable range
+            final_reward = max(-5.0, min(5.0, final_reward))
+
+            return final_reward, was_correct
+
+        except Exception as e:
+            logger.error(f"Error calculating sophisticated reward: {e}")
+            # Fallback to simple reward
+            simple_correct = (
+                (predicted_action == 'BUY' and price_change_pct > 0.1) or
+                (predicted_action == 'SELL' and price_change_pct < -0.1) or
+                (predicted_action == 'HOLD' and abs(price_change_pct) < 0.1)
+            )
+            return (1.0 if simple_correct else -0.5, simple_correct)
+
+    async def _train_model_on_outcome(self, record: Dict, was_correct: bool, price_change_pct: float, sophisticated_reward: float = None):
+        """Train specific model based on prediction outcome with sophisticated reward system"""
         try:
             model_name = record['model_name']
             model_input = record['model_input']
             prediction = record['prediction']

-            # Create training signal based on outcome
-            reward = 1.0 if was_correct else -0.5
+            # Use sophisticated reward if provided, otherwise fallback to simple reward
+            reward = sophisticated_reward if sophisticated_reward is not None else (1.0 if was_correct else -0.5)

             # Train RL models
             if 'dqn' in model_name.lower() and self.rl_agent:
@@ -1700,14 +1779,14 @@ class TradingOrchestrator:
                     next_state=model_input,  # Simplified
                     done=True
                 )
-                logger.debug(f"Added RL training experience: reward={reward}")
+                logger.debug(f"Added RL training experience: reward={reward:.3f} (sophisticated)")

             # Train CNN models using adapter
             elif 'cnn' in model_name.lower() and hasattr(self, 'cnn_adapter') and self.cnn_adapter:
                 # Use the adapter's add_training_sample method
                 actual_action = prediction['action']
                 self.cnn_adapter.add_training_sample(record['symbol'], actual_action, reward)
-                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward}")
+                logger.debug(f"Added CNN training sample: action={actual_action}, reward={reward:.3f} (sophisticated)")

                 # Trigger training if we have enough samples
                 if len(self.cnn_adapter.training_data) >= self.cnn_adapter.batch_size:
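
For a quick sanity check of the new shaping, below is a minimal standalone sketch that restates the same thresholds and multipliers outside the orchestrator. The helper name `sketch_reward` and the sample inputs are illustrative only; the commit itself adds `_calculate_sophisticated_reward` as a method of `TradingOrchestrator`.

# sketch_reward: an illustrative re-statement of the reward shaping in this commit.
# Not the orchestrator method itself; the name and sample inputs are hypothetical.

def sketch_reward(action: str, confidence: float, change_pct: float, minutes: float) -> tuple[float, bool]:
    threshold = 0.1  # 0.1% minimum move counted as significant
    if action == 'BUY':
        correct, accuracy = change_pct > threshold, max(0.0, change_pct)
    elif action == 'SELL':
        correct, accuracy = change_pct < -threshold, max(0.0, -change_pct)
    else:  # HOLD
        correct, accuracy = abs(change_pct) < threshold, max(0.0, threshold - abs(change_pct))

    magnitude = min(abs(change_pct) / 2.0, 3.0)   # capped at 3x (reached at a 6% move)
    scale = 0.5 + confidence * 1.5                # confidence factor, range 0.5 to 2.0
    if correct:
        reward = accuracy * magnitude * scale
        if confidence > 0.8 and abs(change_pct) > 1.0:
            reward *= 1.5                         # bonus: confident call on a large move
    else:
        penalty = abs(change_pct) * scale
        if confidence > 0.8:
            penalty *= 2.0                        # overconfident and wrong hurts most
        reward = -penalty

    reward *= max(0.1, 1.0 - minutes / 60.0)      # decay over one hour, floored at 10%
    return max(-5.0, min(5.0, reward)), correct

# A moderately confident, correct BUY on a 1.5% rise after 10 minutes:
print(sketch_reward('BUY', confidence=0.6, change_pct=1.5, minutes=10.0))   # ≈ (1.31, True)
# The same call when price instead fell 1.5%:
print(sketch_reward('BUY', confidence=0.6, change_pct=-1.5, minutes=10.0))  # ≈ (-1.75, False)
# An overconfident wrong call on a 2% drop hits the -5.0 clamp:
print(sketch_reward('BUY', confidence=0.9, change_pct=-2.0, minutes=10.0))  # (-5.0, False)

Compared with the old flat +1.0 / -0.5 signal, the shaped reward scales with movement size and model confidence, decays as the evaluation is delayed, and is clamped to [-5.0, 5.0], so an overconfident wrong call is punished far harder than a tentative one.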