PnL in reward, show leveraged power in dash (broken)

Dobromir Popov
2025-07-29 17:42:00 +03:00
parent d35530a9e9
commit 3a532a1220
5 changed files with 553 additions and 49 deletions

@@ -3267,12 +3267,15 @@ class TradingOrchestrator:
avg_loss = model_stats.average_loss if model_stats else None
# Calculate reward for logging
current_pnl = self._get_current_position_pnl(self.symbol)
reward, _ = self._calculate_sophisticated_reward(
predicted_action,
predicted_confidence,
actual_price_change_pct,
time_diff_seconds / 60, # Convert to minutes
has_price_prediction=predicted_price is not None,
symbol=self.symbol,
current_position_pnl=current_pnl,
)
# Enhanced logging with detailed information
@@ -3361,6 +3364,7 @@ class TradingOrchestrator:
) # Default to 0.5 if missing
# Calculate sophisticated reward based on multiple factors
current_pnl = self._get_current_position_pnl(symbol)
reward, was_correct = self._calculate_sophisticated_reward(
predicted_action,
prediction_confidence,
@@ -3369,6 +3373,7 @@ class TradingOrchestrator:
inference_price is not None, # Add price prediction flag
symbol, # Pass symbol for position lookup
None, # Let method determine position status
current_position_pnl=current_pnl,
)
# Update model performance tracking
@@ -3476,10 +3481,11 @@ class TradingOrchestrator:
has_price_prediction: bool = False,
symbol: str = None,
has_position: bool = None,
current_position_pnl: float = 0.0,
) -> tuple[float, bool]:
"""
Calculate sophisticated reward based on prediction accuracy, confidence, and price movement magnitude
Now considers position status when evaluating HOLD decisions
Now considers position status and current P&L when evaluating decisions
Args:
predicted_action: The predicted action ('BUY', 'SELL', 'HOLD')
@@ -3489,6 +3495,7 @@ class TradingOrchestrator:
has_price_prediction: Whether the model made a price prediction
symbol: Trading symbol (for position lookup)
has_position: Whether we currently have a position (if None, will be looked up)
current_position_pnl: Current unrealized P&L of open position (0.0 if no position)
Returns:
tuple: (reward, was_correct)
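
For reference, a minimal sketch of the updated call shape (not part of the commit; the symbol, the numeric values, and the name `orchestrator` for a constructed TradingOrchestrator are illustrative assumptions):

# Hypothetical call site: thread the live position P&L into the reward.
current_pnl = orchestrator._get_current_position_pnl("ETH/USDT")
reward, was_correct = orchestrator._calculate_sophisticated_reward(
    "HOLD",    # predicted_action
    0.62,      # prediction confidence
    0.05,      # price change since the prediction, in percent
    1.0,       # elapsed time in minutes
    has_price_prediction=False,
    symbol="ETH/USDT",
    has_position=True,
    current_position_pnl=current_pnl,
)
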
@@ -3500,6 +3507,9 @@ class TradingOrchestrator:
# Determine current position status if not provided
if has_position is None and symbol:
has_position = self._has_open_position(symbol)
# Get current position P&L if we have a position
if has_position and current_position_pnl == 0.0:
current_position_pnl = self._get_current_position_pnl(symbol)
elif has_position is None:
has_position = False
@@ -3518,19 +3528,37 @@ class TradingOrchestrator:
0, -price_change_pct
) # Positive for downward movement
elif predicted_action == "HOLD":
# HOLD evaluation now considers position status
# HOLD evaluation now considers position status AND current P&L
if has_position:
# If we have a position, HOLD is correct if price moved favorably or stayed stable
# This prevents penalizing HOLD when we're already in a profitable position
if price_change_pct > 0: # Price went up while holding - good
was_correct = True
directional_accuracy = price_change_pct # Reward based on profit
elif abs(price_change_pct) < movement_threshold: # Price stable - neutral
was_correct = True
directional_accuracy = movement_threshold - abs(price_change_pct)
else: # Price dropped while holding - bad, but less penalty than wrong direction
was_correct = False
directional_accuracy = max(0, movement_threshold - abs(price_change_pct)) * 0.5
# If we have a position, HOLD evaluation depends on P&L and price movement
if current_position_pnl > 0: # Currently profitable position
# Holding a profitable position is good if price continues favorably
if price_change_pct > 0: # Price went up while holding profitable position - excellent
was_correct = True
directional_accuracy = price_change_pct * 1.5 # Bonus for holding winners
elif abs(price_change_pct) < movement_threshold: # Price stable - good
was_correct = True
directional_accuracy = movement_threshold + (current_position_pnl / 100.0) # Reward based on existing profit
else: # Price dropped while holding profitable position - still okay but less reward
was_correct = True
directional_accuracy = max(0, (current_position_pnl / 100.0) - abs(price_change_pct) * 0.5)
elif current_position_pnl < 0: # Currently losing position
# Holding a losing position is generally bad - should consider closing
if price_change_pct > movement_threshold: # Price recovered - good hold
was_correct = True
directional_accuracy = price_change_pct * 0.8 # Reduced reward for recovery
else: # Price continued down or stayed flat - bad hold
was_correct = False
# Penalty proportional to loss magnitude
directional_accuracy = abs(current_position_pnl / 100.0) * 0.5 # Penalty for holding losers
else: # Breakeven position
# Standard HOLD evaluation for breakeven positions
if abs(price_change_pct) < movement_threshold: # Price stable - good
was_correct = True
directional_accuracy = movement_threshold - abs(price_change_pct)
else: # Price moved significantly - missed opportunity
was_correct = False
directional_accuracy = max(0, movement_threshold - abs(price_change_pct)) * 0.7
else:
# If we don't have a position, HOLD is correct if price stayed relatively stable
was_correct = abs(price_change_pct) < movement_threshold
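
Because the removed and added HOLD branches are interleaved above, here is a self-contained sketch of the resulting in-position HOLD scoring (the function name and the 0.1 default for movement_threshold are assumptions for illustration; it mirrors only the directional_accuracy / was_correct pair):

def score_hold_with_position(price_change_pct: float, position_pnl: float,
                             movement_threshold: float = 0.1) -> tuple[float, bool]:
    """Mirror of the in-position HOLD branches: returns (directional_accuracy, was_correct)."""
    if position_pnl > 0:  # currently profitable position
        if price_change_pct > 0:  # price kept rising while holding a winner
            return price_change_pct * 1.5, True
        if abs(price_change_pct) < movement_threshold:  # price stable
            return movement_threshold + position_pnl / 100.0, True
        # price dropped: still counted correct, but the reward shrinks with the drop
        return max(0.0, position_pnl / 100.0 - abs(price_change_pct) * 0.5), True
    if position_pnl < 0:  # currently losing position
        if price_change_pct > movement_threshold:  # price recovered, acceptable hold
            return price_change_pct * 0.8, True
        return abs(position_pnl / 100.0) * 0.5, False  # penalty for holding losers, scaled by the loss
    # breakeven position: standard stability test
    if abs(price_change_pct) < movement_threshold:
        return movement_threshold - abs(price_change_pct), True
    return max(0.0, movement_threshold - abs(price_change_pct)) * 0.7, False

# score_hold_with_position(0.05, 2.0)   -> (0.075, True)    holding a rising winner
# score_hold_with_position(-0.3, -1.5)  -> (0.0075, False)   holding a sinking loser
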
@@ -3627,12 +3655,16 @@ class TradingOrchestrator:
# Calculate reward if not provided
if sophisticated_reward is None:
symbol = record.get("symbol", self.symbol)
current_pnl = self._get_current_position_pnl(symbol)
sophisticated_reward, _ = self._calculate_sophisticated_reward(
record.get("action", "HOLD"),
record.get("confidence", 0.5),
price_change_pct,
record.get("time_diff_minutes", 1.0),
record.get("has_price_prediction", False),
symbol=symbol,
current_position_pnl=current_pnl,
)
# Train decision fusion model if it's the model being evaluated
@@ -6510,7 +6542,7 @@ class TradingOrchestrator:
logger.error(f"Error getting combined model data for {symbol}: {e}")
return None
def _get_current_position_pnl(self, symbol: str, current_price: float) -> float:
def _get_current_position_pnl(self, symbol: str, current_price: float = None) -> float:
"""Get current position P&L for the symbol"""
try:
if self.trading_executor and hasattr(
@@ -6518,16 +6550,22 @@
):
position = self.trading_executor.get_current_position(symbol)
if position:
entry_price = position.get("price", 0)
size = position.get("size", 0)
side = position.get("side", "LONG")
# If current_price is provided, calculate P&L manually
if current_price is not None:
entry_price = position.get("price", 0)
size = position.get("size", 0)
side = position.get("side", "LONG")
if entry_price and size > 0:
if side.upper() == "LONG":
pnl = (current_price - entry_price) * size
else: # SHORT
pnl = (entry_price - current_price) * size
return pnl
if entry_price and size > 0:
if side.upper() == "LONG":
pnl = (current_price - entry_price) * size
else: # SHORT
pnl = (entry_price - current_price) * size
return pnl
else:
# Use unrealized_pnl from position if available
if position.get("size", 0) > 0:
return float(position.get("unrealized_pnl", 0.0))
return 0.0
except Exception as e:
logger.debug(f"Error getting position P&L for {symbol}: {e}")
@@ -6545,6 +6583,53 @@ class TradingOrchestrator:
except Exception:
return False
def _calculate_position_enhanced_reward_for_dqn(self, base_reward, action, position_pnl, has_position):
"""
Calculate position-enhanced reward for DQN to incentivize profitable trades and closing losing ones
Args:
base_reward: Original reward from confidence/execution
action: Action taken ('BUY', 'SELL', 'HOLD')
position_pnl: Current position P&L
has_position: Whether we have an open position
Returns:
Enhanced reward that incentivizes profitable behavior
"""
try:
enhanced_reward = base_reward
if has_position and position_pnl != 0.0:
# Position-based reward adjustments (similar to CNN but tuned for DQN)
pnl_factor = position_pnl / 100.0 # Normalize P&L to reasonable scale
if position_pnl > 0: # Profitable position
if action == "HOLD":
# Reward holding profitable positions (let winners run)
enhanced_reward += abs(pnl_factor) * 0.4
elif action in ["BUY", "SELL"]:
# Moderate reward for taking action on profitable positions
enhanced_reward += abs(pnl_factor) * 0.2
elif position_pnl < 0: # Losing position
if action == "HOLD":
# Strong penalty for holding losing positions (cut losses)
enhanced_reward -= abs(pnl_factor) * 1.0
elif action in ["BUY", "SELL"]:
# Strong reward for taking action to close losing positions
enhanced_reward += abs(pnl_factor) * 0.8
# Ensure reward doesn't become extreme (DQN is more sensitive to reward scale)
enhanced_reward = max(-2.0, min(2.0, enhanced_reward))
return enhanced_reward
except Exception as e:
logger.error(f"Error calculating position-enhanced reward for DQN: {e}")
return base_reward
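
A few hand-worked calls make the shaping visible (a usage sketch only; `orchestrator` stands for a constructed TradingOrchestrator and the numbers are made up):

# Holding a winner: small bonus on top of the confidence-based base reward.
orchestrator._calculate_position_enhanced_reward_for_dqn(0.6, "HOLD", 2.0, True)   # 0.6 + 0.02*0.4 = 0.608
# Holding a loser: penalty proportional to the loss.
orchestrator._calculate_position_enhanced_reward_for_dqn(0.1, "HOLD", -3.0, True)  # 0.1 - 0.03*1.0 = 0.070
# Acting while in a losing position: rewarded for moving to close it.
orchestrator._calculate_position_enhanced_reward_for_dqn(0.7, "SELL", -3.0, True)  # 0.7 + 0.03*0.8 = 0.724
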
def _close_all_positions(self):
"""Close all open positions when clearing session"""
try:
@@ -6889,28 +6974,35 @@ class TradingOrchestrator:
action_mapping = {"BUY": 0, "SELL": 1, "HOLD": 2}
dqn_action = action_mapping.get(action, 2)
# Calculate immediate reward based on confidence and execution
immediate_reward = confidence if action != "HOLD" else 0.0
# Get position information for enhanced rewards
has_position = self._has_open_position(symbol)
position_pnl = self._get_current_position_pnl(symbol) if has_position else 0.0
# Calculate position-enhanced reward
base_reward = confidence if action != "HOLD" else 0.1
enhanced_reward = self._calculate_position_enhanced_reward_for_dqn(
base_reward, action, position_pnl, has_position
)
# Add experience to DQN
self.rl_agent.remember(
state=state,
action=dqn_action,
reward=immediate_reward,
reward=enhanced_reward,
next_state=state, # Will be updated with actual outcome later
done=False,
)
models_trained.append("dqn")
logger.debug(
f"🧠 Added DQN experience: {action} {symbol} (reward: {immediate_reward:.3f})"
f"🧠 Added DQN experience: {action} {symbol} (reward: {enhanced_reward:.3f}, P&L: ${position_pnl:.2f})"
)
except Exception as e:
logger.debug(f"Error training DQN on decision: {e}")
# Train CNN model if available and enabled
if self.cnn_model and hasattr(self.cnn_model, "add_training_sample") and self.is_model_training_enabled("cnn"):
if self.cnn_model and hasattr(self.cnn_model, "add_training_data") and self.is_model_training_enabled("cnn"):
try:
# Create CNN input features from base_data (same as inference)
cnn_features = self._create_cnn_features_from_base_data(
@@ -6919,19 +7011,30 @@ class TradingOrchestrator:
# Create target based on action
target_mapping = {
"BUY": [1, 0, 0],
"SELL": [0, 1, 0],
"HOLD": [0, 0, 1],
"BUY": 0, # Action indices for CNN
"SELL": 1,
"HOLD": 2,
}
target = target_mapping.get(action, [0, 0, 1])
target_action = target_mapping.get(action, 2)
# Add training sample
self.cnn_model.add_training_sample(
cnn_features, target, weight=confidence
# Get position information for enhanced rewards
has_position = self._has_open_position(symbol)
position_pnl = self._get_current_position_pnl(symbol) if has_position else 0.0
# Calculate base reward from confidence and add position-based enhancement
base_reward = confidence if action != "HOLD" else 0.1
# Add training data with position-based reward enhancement
self.cnn_model.add_training_data(
cnn_features,
target_action,
base_reward,
position_pnl=position_pnl,
has_position=has_position
)
models_trained.append("cnn")
logger.debug(f"🔍 Added CNN training sample: {action} {symbol}")
logger.debug(f"🔍 Added CNN training sample: {action} {symbol} (P&L: ${position_pnl:.2f})")
except Exception as e:
logger.debug(f"Error training CNN on decision: {e}")