fix DQN RL inference, rebuild model
@@ -21,6 +21,112 @@ from utils.training_integration import get_training_integration
 # Configure logger
 logger = logging.getLogger(__name__)

+class DQNNetwork(nn.Module):
+    """
+    Deep Q-Network specifically designed for RL trading with unified BaseDataInput features
+    Handles 7850 input features from multi-timeframe, multi-asset data
+    """
+    def __init__(self, input_dim: int, n_actions: int):
+        super(DQNNetwork, self).__init__()
+
+        # Handle different input dimension formats
+        if isinstance(input_dim, (tuple, list)):
+            if len(input_dim) == 1:
+                self.input_size = input_dim[0]
+            else:
+                self.input_size = int(np.prod(input_dim))  # Flatten multi-dimensional input
+        else:
+            self.input_size = input_dim
+
+        self.n_actions = n_actions
+
+        # Deep network architecture optimized for trading features
+        self.network = nn.Sequential(
+            # Input layer
+            nn.Linear(self.input_size, 2048),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+
+            # Hidden layers
+            nn.Linear(2048, 1024),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+
+            nn.Linear(1024, 512),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+
+            nn.Linear(512, 256),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+
+            nn.Linear(256, 128),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+
+            # Output layer for Q-values
+            nn.Linear(128, n_actions)
+        )
+
+        # Initialize weights
+        self._initialize_weights()
+
+    def _initialize_weights(self):
+        """Initialize network weights using Xavier initialization"""
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+
+    def forward(self, x):
+        """Forward pass through the network"""
+        # Ensure input is properly shaped
+        if x.dim() > 2:
+            x = x.view(x.size(0), -1)  # Flatten if needed
+        elif x.dim() == 1:
+            x = x.unsqueeze(0)  # Add batch dimension if needed
+
+        return self.network(x)
+
+    def act(self, state, explore=True):
+        """
+        Select an action from the network's Q-values (greedy; exploration is applied by the agent)
+
+        Args:
+            state: Current state (numpy array or tensor)
+            explore: Whether to use epsilon-greedy exploration
+
+        Returns:
+            action_idx: Selected action index
+            confidence: Confidence score
+            action_probs: Action probabilities
+        """
+        # Convert state to tensor if needed
+        if isinstance(state, np.ndarray):
+            state = torch.FloatTensor(state).to(next(self.parameters()).device)
+
+        # Ensure proper shape
+        if state.dim() == 1:
+            state = state.unsqueeze(0)
+
+        with torch.no_grad():
+            q_values = self.forward(state)
+
+            # Get action probabilities using softmax
+            action_probs = F.softmax(q_values, dim=1)
+
+            # Select action (greedy for inference)
+            action_idx = torch.argmax(q_values, dim=1).item()
+
+            # Calculate confidence as max probability
+            confidence = float(action_probs[0, action_idx].item())
+
+            # Convert probabilities to list
+            probs_list = action_probs.squeeze(0).cpu().numpy().tolist()
+
+        return action_idx, confidence, probs_list
+
 class DQNAgent:
     """
     Deep Q-Network agent for trading
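For orientation only (not part of this commit): a minimal sketch of how the new DQNNetwork.act path could be exercised, assuming the 7850-feature state size from the class docstring and a two-action (BUY/SELL) setup; the dummy state and action count below are illustrative.

import numpy as np

# Assumes DQNNetwork (defined above) and its torch/numpy imports are in scope.
net = DQNNetwork(input_dim=7850, n_actions=2)   # 7850 features per the docstring; 2 actions is an assumption
net.eval()                                      # disable dropout for inference

dummy_state = np.zeros(7850, dtype=np.float32)  # placeholder feature vector
action_idx, confidence, action_probs = net.act(dummy_state, explore=False)
# action_idx: int in [0, n_actions); confidence: max softmax probability; action_probs: list of floats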
@@ -80,12 +186,9 @@ class DQNAgent:
         else:
             self.device = device

-        # Initialize models with Enhanced CNN architecture for better performance
-        from NN.models.enhanced_cnn import EnhancedCNN
-
-        # Use Enhanced CNN for both policy and target networks
-        self.policy_net = EnhancedCNN(self.state_dim, self.n_actions)
-        self.target_net = EnhancedCNN(self.state_dim, self.n_actions)
+        # Initialize models with RL-specific network architecture
+        self.policy_net = DQNNetwork(self.state_dim, self.n_actions).to(self.device)
+        self.target_net = DQNNetwork(self.state_dim, self.n_actions).to(self.device)

         # Initialize the target network with the same weights as the policy network
         self.target_net.load_state_dict(self.policy_net.state_dict())
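Side note, not from the commit: load_state_dict here gives the target network an exact copy of the policy weights at construction time; during training, standard DQN re-syncs the target either periodically (hard update) or gradually (soft/Polyak update). A sketch of both patterns, with illustrative attribute names (training_steps, target_update_freq) that may not match this agent's actual fields:

# Hard update: copy policy weights into the target net every N training steps.
if self.training_steps % self.target_update_freq == 0:
    self.target_net.load_state_dict(self.policy_net.state_dict())

# Soft (Polyak) update: blend policy weights into the target net each step.
tau = 0.005  # illustrative blend factor
for t_param, p_param in zip(self.target_net.parameters(), self.policy_net.parameters()):
    t_param.data.copy_(tau * p_param.data + (1.0 - tau) * t_param.data)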
@@ -578,83 +681,45 @@
             market_context: Additional market context for decision making

         Returns:
-            int: Action (0=BUY, 1=SELL, 2=HOLD) or None if should hold position
+            int: Action (0=BUY, 1=SELL)
         """
-
-        # Convert state to tensor
-        if isinstance(state, np.ndarray):
-            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
-        else:
-            state_tensor = state.unsqueeze(0).to(self.device)
-
-        # Get Q-values
-        policy_output = self.policy_net(state_tensor)
-        if isinstance(policy_output, dict):
-            q_values = policy_output.get('q_values', policy_output.get('Q_values', list(policy_output.values())[0]))
-        elif isinstance(policy_output, tuple):
-            q_values = policy_output[0]  # Assume first element is Q-values
-        else:
-            q_values = policy_output
-        action_values = q_values.cpu().data.numpy()[0]
-
-        # Calculate confidence scores
-        # Ensure q_values has correct shape for softmax
-        if q_values.dim() == 1:
-            q_values = q_values.unsqueeze(0)
-
-        # FIXED ACTION MAPPING: 0=BUY, 1=SELL, 2=HOLD
-        buy_confidence = torch.softmax(q_values, dim=1)[0, 0].item()
-        sell_confidence = torch.softmax(q_values, dim=1)[0, 1].item()
-
-        # Determine action based on current position and confidence thresholds
-        action = self._determine_action_with_position_management(
-            sell_confidence, buy_confidence, current_price, market_context, explore
-        )
-
-        # Update tracking
-        if current_price:
-            self.recent_prices.append(current_price)
-
-        if action is not None:
-            self.recent_actions.append(action)
-            return action
-        else:
-            # Return 1 (HOLD) as a safe default if action is None
+        try:
+            # Use the DQNNetwork's act method for consistent behavior
+            action_idx, confidence, action_probs = self.policy_net.act(state, explore=explore)
+
+            # Apply epsilon-greedy exploration if requested
+            if explore and np.random.random() <= self.epsilon:
+                action_idx = np.random.choice(self.n_actions)
+
+            # Update tracking
+            if current_price:
+                self.recent_prices.append(current_price)
+
+            self.recent_actions.append(action_idx)
+            return action_idx
+
+        except Exception as e:
+            logger.error(f"Error in act method: {e}")
+            # Return default action (HOLD/SELL)
+            return 1

-    def act_with_confidence(self, state: np.ndarray, market_regime: str = 'trending') -> Tuple[int, float]:
-        """Choose action with confidence score adapted to market regime (from Enhanced DQN)"""
-        with torch.no_grad():
-            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
-            q_values = self.policy_net(state_tensor)
-
-            # Handle case where network might return a tuple instead of tensor
-            if isinstance(q_values, tuple):
-                # If it's a tuple, take the first element (usually the main output)
-                q_values = q_values[0]
-
-            # Ensure q_values is a tensor and has correct shape for softmax
-            if not hasattr(q_values, 'dim'):
-                logger.error(f"DQN: q_values is not a tensor: {type(q_values)}")
-                # Return default action with low confidence
-                return 1, 0.1  # Default to HOLD action
-
-            if q_values.dim() == 1:
-                q_values = q_values.unsqueeze(0)
-
-            # Convert Q-values to probabilities
-            action_probs = torch.softmax(q_values, dim=1)
-            action = q_values.argmax().item()
-            base_confidence = action_probs[0, action].item()
+    def act_with_confidence(self, state: np.ndarray, market_regime: str = 'trending') -> Tuple[int, float, List[float]]:
+        """Choose action with confidence score adapted to market regime"""
+        try:
+            # Use the DQNNetwork's act method which handles the state properly
+            action_idx, base_confidence, action_probs = self.policy_net.act(state, explore=False)

             # Adapt confidence based on market regime
             regime_weight = self.market_regime_weights.get(market_regime, 1.0)
             adapted_confidence = min(base_confidence * regime_weight, 1.0)

-            # Always return int, float
-            if action is None:
-                return 1, 0.1
-            return int(action), float(adapted_confidence)
+            # Return action, confidence, and probabilities (for orchestrator compatibility)
+            return int(action_idx), float(adapted_confidence), action_probs

+        except Exception as e:
+            logger.error(f"Error in act_with_confidence: {e}")
+            # Return default action with low confidence
+            return 1, 0.1, [0.45, 0.55]  # Default to HOLD action

     def _determine_action_with_position_management(self, sell_conf, buy_conf, current_price, market_context, explore):
         """
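For callers, the reworked methods now return richer results: DQNAgent.act returns an action index, and act_with_confidence returns an (action, confidence, probabilities) triple. A consumption sketch, not part of the commit, with hypothetical construction arguments and helper names (build_state, execute_trade):

agent = DQNAgent(state_dim=7850, n_actions=2)       # constructor arguments are assumptions
state = build_state()                                # hypothetical feature builder returning a 7850-dim array

action_idx, confidence, action_probs = agent.act_with_confidence(state, market_regime='trending')

ACTION_NAMES = {0: 'BUY', 1: 'SELL'}                 # mapping per the updated act() docstring
if confidence >= 0.6:                                # illustrative threshold
    execute_trade(ACTION_NAMES[action_idx])          # hypothetical execution hook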