training: conviction-aware reward shaping

Dobromir Popov
2025-08-10 13:23:29 +03:00
parent 6861d0f20b
commit b3c5076e37
2 changed files with 69 additions and 10 deletions


@@ -3834,17 +3834,48 @@ class TradingOrchestrator:
             base_reward = -0.1 * prediction_confidence
             logger.debug(f"NOISE INCORRECT: Wrong direction on noise movement = {base_reward:.2f}")
-        # POSITION-AWARE ADJUSTMENTS
+        # POSITION-AWARE ADJUSTMENTS (conviction-aware; learned bias via reward shaping)
         if has_position:
-            # Adjust rewards based on current position status
-            if current_position_pnl > 0.5:  # Profitable position
-                if predicted_action == "HOLD" and price_change_pct > 0:
-                    base_reward += 0.5  # Bonus for holding profitable position during uptrend
-                    logger.debug(f"POSITION BONUS: Holding profitable position during uptrend = +0.5")
-            elif current_position_pnl < -0.5:  # Losing position
-                if predicted_action in ["BUY", "SELL"] and directional_correct:
-                    base_reward += 0.3  # Bonus for taking action to exit losing position
-                    logger.debug(f"EXIT BONUS: Taking action on losing position = +0.3")
+            # Derive conviction from prediction_confidence (0..1)
+            conviction = max(0.0, min(1.0, float(prediction_confidence)))
+            # Estimate expected move magnitude if provided by vector; else 0
+            expected_move_pct = 0.0
+            try:
+                if predicted_price_vector and isinstance(predicted_price_vector, dict):
+                    # Accept either a normalized magnitude or compute from price fields if present
+                    if 'expected_move_pct' in predicted_price_vector:
+                        expected_move_pct = float(predicted_price_vector.get('expected_move_pct', 0.0))
+                    elif 'predicted_price' in predicted_price_vector and 'current_price' in predicted_price_vector:
+                        cp = float(predicted_price_vector.get('current_price') or 0.0)
+                        pp = float(predicted_price_vector.get('predicted_price') or 0.0)
+                        if cp > 0 and pp > 0:
+                            expected_move_pct = ((pp - cp) / cp) * 100.0
+            except Exception:
+                expected_move_pct = 0.0
+            # Normalize expected move impact into [0,1]
+            expected_move_norm = max(0.0, min(1.0, abs(expected_move_pct) / 2.0))  # 2% move caps to 1.0
+            # Conviction-tolerant drawdown penalty (cut losers early unless strong conviction for recovery)
+            if current_position_pnl < 0:
+                pnl_loss = abs(current_position_pnl)
+                # Scale negative PnL into [0,1] using a soft scale (1% -> 1.0 cap)
+                loss_norm = max(0.0, min(1.0, pnl_loss / 1.0))
+                tolerance = 1.0 - min(0.9, conviction * expected_move_norm)  # high conviction reduces penalty
+                penalty = loss_norm * tolerance
+                base_reward -= 1.0 * penalty
+                logger.debug(
+                    f"CONVICTION DRAWDOWN: pnl={current_position_pnl:.3f}, conv={conviction:.2f}, exp={expected_move_norm:.2f}, penalty={penalty:.3f}"
+                )
+            else:
+                # Let winners run when conviction supports it
+                gain = max(0.0, current_position_pnl)
+                gain_norm = max(0.0, min(1.0, gain / 1.0))
+                run_bonus = 0.2 * gain_norm * (0.5 + 0.5 * conviction)
+                # Small nudge to keep holding if directionally correct
+                base_reward += run_bonus
+                logger.debug(f"RUN BONUS: gain={gain:.3f}, conv={conviction:.2f}, bonus={run_bonus:.3f}")
         # PRICE VECTOR BONUS (if available)
         if predicted_price_vector and isinstance(predicted_price_vector, dict):
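
Read as a standalone rule, the new branch does two things: it scales the drawdown penalty down when high confidence is paired with a large predicted move (tolerating a losing position the model expects to recover), and it pays a running bonus on winners that grows with conviction. The sketch below restates that logic as a self-contained function; the name shape_position_reward and its scalar percent inputs are illustrative assumptions, not part of the repository.

from typing import Optional

def shape_position_reward(
    base_reward: float,
    prediction_confidence: float,
    current_position_pnl: float,
    predicted_price_vector: Optional[dict] = None,
) -> float:
    """Restatement of the conviction-aware position adjustment in the diff above."""
    conviction = max(0.0, min(1.0, float(prediction_confidence)))

    # Expected move in percent: prefer an explicit field, else derive from prices.
    expected_move_pct = 0.0
    if isinstance(predicted_price_vector, dict):
        if "expected_move_pct" in predicted_price_vector:
            expected_move_pct = float(predicted_price_vector.get("expected_move_pct", 0.0))
        else:
            cp = float(predicted_price_vector.get("current_price") or 0.0)
            pp = float(predicted_price_vector.get("predicted_price") or 0.0)
            if cp > 0 and pp > 0:
                expected_move_pct = ((pp - cp) / cp) * 100.0

    # A predicted move of 2% or more saturates the conviction multiplier.
    expected_move_norm = max(0.0, min(1.0, abs(expected_move_pct) / 2.0))

    if current_position_pnl < 0:
        # Losing position: the penalty shrinks as conviction * expected move grows,
        # but never by more than 90% (the min(0.9, ...) floor in the diff).
        loss_norm = max(0.0, min(1.0, abs(current_position_pnl) / 1.0))
        tolerance = 1.0 - min(0.9, conviction * expected_move_norm)
        base_reward -= loss_norm * tolerance
    else:
        # Winning (or flat) position: bonus of up to 0.2 per capped 1% gain,
        # scaled between 0.5x and 1.0x by conviction.
        gain_norm = max(0.0, min(1.0, current_position_pnl / 1.0))
        base_reward += 0.2 * gain_norm * (0.5 + 0.5 * conviction)

    return base_reward

# Worked example: a 0.8% drawdown with confidence 0.9 and a predicted 2% move.
# loss_norm = 0.8, tolerance = 1.0 - min(0.9, 0.9 * 1.0) = 0.1, penalty = 0.08,
# versus the full 0.8 penalty with zero conviction.
print(shape_position_reward(0.0, 0.9, -0.8, {"expected_move_pct": 2.0}))  # ~ -0.08
print(shape_position_reward(0.0, 0.0, -0.8))                              # -0.8

Note the asymmetry in the constants: the drawdown penalty can reach -1.0 while the run bonus tops out at +0.2, so the shaping still biases toward cutting losers unless conviction and the predicted move are both high.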