training: conviction-aware reward shaping
@@ -3834,17 +3834,48 @@ class TradingOrchestrator:
                     base_reward = -0.1 * prediction_confidence
                     logger.debug(f"NOISE INCORRECT: Wrong direction on noise movement = {base_reward:.2f}")

-            # POSITION-AWARE ADJUSTMENTS
+            # POSITION-AWARE ADJUSTMENTS (conviction-aware; learned bias via reward shaping)
             if has_position:
                 # Adjust rewards based on current position status
-                if current_position_pnl > 0.5:  # Profitable position
-                    if predicted_action == "HOLD" and price_change_pct > 0:
-                        base_reward += 0.5  # Bonus for holding profitable position during uptrend
-                        logger.debug(f"POSITION BONUS: Holding profitable position during uptrend = +0.5")
-                elif current_position_pnl < -0.5:  # Losing position
-                    if predicted_action in ["BUY", "SELL"] and directional_correct:
-                        base_reward += 0.3  # Bonus for taking action to exit losing position
-                        logger.debug(f"EXIT BONUS: Taking action on losing position = +0.3")
+                # Derive conviction from prediction_confidence (0..1)
+                conviction = max(0.0, min(1.0, float(prediction_confidence)))
+                # Estimate expected move magnitude if provided by vector; else 0
+                expected_move_pct = 0.0
+                try:
+                    if predicted_price_vector and isinstance(predicted_price_vector, dict):
+                        # Accept either a normalized magnitude or compute from price fields if present
+                        if 'expected_move_pct' in predicted_price_vector:
+                            expected_move_pct = float(predicted_price_vector.get('expected_move_pct', 0.0))
+                        elif 'predicted_price' in predicted_price_vector and 'current_price' in predicted_price_vector:
+                            cp = float(predicted_price_vector.get('current_price') or 0.0)
+                            pp = float(predicted_price_vector.get('predicted_price') or 0.0)
+                            if cp > 0 and pp > 0:
+                                expected_move_pct = ((pp - cp) / cp) * 100.0
+                except Exception:
+                    expected_move_pct = 0.0
+
+                # Normalize expected move impact into [0,1]
+                expected_move_norm = max(0.0, min(1.0, abs(expected_move_pct) / 2.0))  # 2% move caps to 1.0
+
+                # Conviction-tolerant drawdown penalty (cut losers early unless strong conviction for recovery)
+                if current_position_pnl < 0:
+                    pnl_loss = abs(current_position_pnl)
+                    # Scale negative PnL into [0,1] using a soft scale (1% -> 1.0 cap)
+                    loss_norm = max(0.0, min(1.0, pnl_loss / 1.0))
+                    tolerance = (1.0 - min(0.9, conviction * expected_move_norm))  # high conviction reduces penalty
+                    penalty = loss_norm * tolerance
+                    base_reward -= 1.0 * penalty
+                    logger.debug(
+                        f"CONVICTION DRAWDOWN: pnl={current_position_pnl:.3f}, conv={conviction:.2f}, exp={expected_move_norm:.2f}, penalty={penalty:.3f}"
+                    )
+                else:
+                    # Let winners run when conviction supports it
+                    gain = max(0.0, current_position_pnl)
+                    gain_norm = max(0.0, min(1.0, gain / 1.0))
+                    run_bonus = 0.2 * gain_norm * (0.5 + 0.5 * conviction)
+                    # Small nudge to keep holding if directionally correct
+                    base_reward += run_bonus
+                    logger.debug(f"RUN BONUS: gain={gain:.3f}, conv={conviction:.2f}, bonus={run_bonus:.3f}")

             # PRICE VECTOR BONUS (if available)
             if predicted_price_vector and isinstance(predicted_price_vector, dict):
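
As a reading aid, a minimal standalone sketch of the shaping math introduced above, using the same constants as the diff; the function name shape_position_reward and the example inputs are hypothetical and not part of the commit:

def shape_position_reward(base_reward: float,
                          current_position_pnl: float,
                          prediction_confidence: float,
                          expected_move_pct: float) -> float:
    """Conviction-aware position adjustment, mirroring the diff above."""
    conviction = max(0.0, min(1.0, float(prediction_confidence)))
    # A 2% expected move saturates the conviction multiplier
    expected_move_norm = max(0.0, min(1.0, abs(expected_move_pct) / 2.0))

    if current_position_pnl < 0:
        # Drawdown penalty, softened when conviction * expected move is high
        loss_norm = max(0.0, min(1.0, abs(current_position_pnl) / 1.0))
        tolerance = 1.0 - min(0.9, conviction * expected_move_norm)
        return base_reward - loss_norm * tolerance

    # Otherwise add a small "let winners run" bonus scaled by gain and conviction
    gain_norm = max(0.0, min(1.0, current_position_pnl / 1.0))
    return base_reward + 0.2 * gain_norm * (0.5 + 0.5 * conviction)


# Example: a 0.4% losing position with 0.9 confidence and a predicted 1.5% move
# keeps most of its reward (tolerance ~= 0.325, penalty ~= 0.13 -> returns -0.13).
print(shape_position_reward(base_reward=0.0, current_position_pnl=-0.4,
                            prediction_confidence=0.9, expected_move_pct=1.5))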