training: conviction-aware reward shaping

Dobromir Popov
2025-08-10 13:23:29 +03:00
parent 6861d0f20b
commit b3c5076e37
2 changed files with 69 additions and 10 deletions


@@ -3834,17 +3834,48 @@ class TradingOrchestrator:
             base_reward = -0.1 * prediction_confidence
             logger.debug(f"NOISE INCORRECT: Wrong direction on noise movement = {base_reward:.2f}")
-        # POSITION-AWARE ADJUSTMENTS
+        # POSITION-AWARE ADJUSTMENTS (conviction-aware; learned bias via reward shaping)
         if has_position:
-            # Adjust rewards based on current position status
-            if current_position_pnl > 0.5:  # Profitable position
-                if predicted_action == "HOLD" and price_change_pct > 0:
-                    base_reward += 0.5  # Bonus for holding profitable position during uptrend
-                    logger.debug(f"POSITION BONUS: Holding profitable position during uptrend = +0.5")
-            elif current_position_pnl < -0.5:  # Losing position
-                if predicted_action in ["BUY", "SELL"] and directional_correct:
-                    base_reward += 0.3  # Bonus for taking action to exit losing position
-                    logger.debug(f"EXIT BONUS: Taking action on losing position = +0.3")
+            # Derive conviction from prediction_confidence (0..1)
+            conviction = max(0.0, min(1.0, float(prediction_confidence)))
+            # Estimate expected move magnitude if provided by vector; else 0
+            expected_move_pct = 0.0
+            try:
+                if predicted_price_vector and isinstance(predicted_price_vector, dict):
+                    # Accept either a normalized magnitude or compute from price fields if present
+                    if 'expected_move_pct' in predicted_price_vector:
+                        expected_move_pct = float(predicted_price_vector.get('expected_move_pct', 0.0))
+                    elif 'predicted_price' in predicted_price_vector and 'current_price' in predicted_price_vector:
+                        cp = float(predicted_price_vector.get('current_price') or 0.0)
+                        pp = float(predicted_price_vector.get('predicted_price') or 0.0)
+                        if cp > 0 and pp > 0:
+                            expected_move_pct = ((pp - cp) / cp) * 100.0
+            except Exception:
+                expected_move_pct = 0.0
+            # Normalize expected move impact into [0,1]
+            expected_move_norm = max(0.0, min(1.0, abs(expected_move_pct) / 2.0))  # 2% move caps to 1.0
+            # Conviction-tolerant drawdown penalty (cut losers early unless strong conviction for recovery)
+            if current_position_pnl < 0:
+                pnl_loss = abs(current_position_pnl)
+                # Scale negative PnL into [0,1] using a soft scale (1% -> 1.0 cap)
+                loss_norm = max(0.0, min(1.0, pnl_loss / 1.0))
+                tolerance = 1.0 - min(0.9, conviction * expected_move_norm)  # high conviction reduces penalty
+                penalty = loss_norm * tolerance
+                base_reward -= 1.0 * penalty
+                logger.debug(
+                    f"CONVICTION DRAWDOWN: pnl={current_position_pnl:.3f}, conv={conviction:.2f}, exp={expected_move_norm:.2f}, penalty={penalty:.3f}"
+                )
+            else:
+                # Let winners run when conviction supports it
+                gain = max(0.0, current_position_pnl)
+                gain_norm = max(0.0, min(1.0, gain / 1.0))
+                run_bonus = 0.2 * gain_norm * (0.5 + 0.5 * conviction)
+                # Small nudge to keep holding if directionally correct
+                base_reward += run_bonus
+                logger.debug(f"RUN BONUS: gain={gain:.3f}, conv={conviction:.2f}, bonus={run_bonus:.3f}")
         # PRICE VECTOR BONUS (if available)
         if predicted_price_vector and isinstance(predicted_price_vector, dict):
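
Read as a standalone rule, the new branch does two things: it scales the drawdown penalty down when high confidence is paired with a large predicted move (tolerating a losing position the model expects to recover), and it pays a running bonus on winners that grows with conviction. The sketch below restates that logic as a self-contained function; the name shape_position_reward and its scalar percent inputs are illustrative assumptions, not part of the repository.

from typing import Optional

def shape_position_reward(
    base_reward: float,
    prediction_confidence: float,
    current_position_pnl: float,
    predicted_price_vector: Optional[dict] = None,
) -> float:
    """Restatement of the conviction-aware position adjustment in the diff above."""
    conviction = max(0.0, min(1.0, float(prediction_confidence)))

    # Expected move in percent: prefer an explicit field, else derive from prices.
    expected_move_pct = 0.0
    if isinstance(predicted_price_vector, dict):
        if "expected_move_pct" in predicted_price_vector:
            expected_move_pct = float(predicted_price_vector.get("expected_move_pct", 0.0))
        else:
            cp = float(predicted_price_vector.get("current_price") or 0.0)
            pp = float(predicted_price_vector.get("predicted_price") or 0.0)
            if cp > 0 and pp > 0:
                expected_move_pct = ((pp - cp) / cp) * 100.0

    # A predicted move of 2% or more saturates the conviction multiplier.
    expected_move_norm = max(0.0, min(1.0, abs(expected_move_pct) / 2.0))

    if current_position_pnl < 0:
        # Losing position: the penalty shrinks as conviction * expected move grows,
        # but never by more than 90% (the min(0.9, ...) floor in the diff).
        loss_norm = max(0.0, min(1.0, abs(current_position_pnl) / 1.0))
        tolerance = 1.0 - min(0.9, conviction * expected_move_norm)
        base_reward -= loss_norm * tolerance
    else:
        # Winning (or flat) position: bonus of up to 0.2 per capped 1% gain,
        # scaled between 0.5x and 1.0x by conviction.
        gain_norm = max(0.0, min(1.0, current_position_pnl / 1.0))
        base_reward += 0.2 * gain_norm * (0.5 + 0.5 * conviction)

    return base_reward

# Worked example: a 0.8% drawdown with confidence 0.9 and a predicted 2% move.
# loss_norm = 0.8, tolerance = 1.0 - min(0.9, 0.9 * 1.0) = 0.1, penalty = 0.08,
# versus the full 0.8 penalty with zero conviction.
print(shape_position_reward(0.0, 0.9, -0.8, {"expected_move_pct": 2.0}))  # ~ -0.08
print(shape_position_reward(0.0, 0.0, -0.8))                              # -0.8

Note the asymmetry in the constants: the drawdown penalty can reach -1.0 while the run bonus tops out at +0.2, so the shaping still biases toward cutting losers unless conviction and the predicted move are both high.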