new training process and changes to the models (wip)

2025-04-01 18:43:26 +03:00
parent a78906a888
commit 902593b5f3
6 changed files with 5151 additions and 2635 deletions
--- a/NN/models/dqn_agent.py
+++ b/NN/models/dqn_agent.py
@@ -28,14 +28,14 @@ class DQNAgent:
                 window_size: int,
                 num_features: int,
                 timeframes: List[str],
-                 learning_rate: float = 0.001,
-                 gamma: float = 0.99,
+                 learning_rate: float = 0.0005,  # Reduced learning rate for more stability
+                 gamma: float = 0.97,            # Slightly reduced discount factor
                 epsilon: float = 1.0,
-                 epsilon_min: float = 0.01,
-                 epsilon_decay: float = 0.995,
-                 memory_size: int = 10000,
-                 batch_size: int = 64,
-                 target_update: int = 10):
+                 epsilon_min: float = 0.05,      # Increased minimum epsilon for more exploration
+                 epsilon_decay: float = 0.9975,  # Slower decay rate
+                 memory_size: int = 20000,       # Increased memory size
+                 batch_size: int = 128,          # Larger batch size
+                 target_update: int = 5):        # More frequent target updates
        
        self.state_size = state_size
        self.action_size = action_size
@@ -70,23 +70,25 @@ class DQNAgent:
        ).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        
-        # Initialize optimizer
-        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
+        # Initialize optimizer with L2 weight decay (gradient clipping is applied in replay)
+        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)
        
-        # Initialize memory
+        # Initialize memories with different priorities
        self.memory = deque(maxlen=memory_size)
-        
-        # Special memory for extrema samples to use for targeted learning
-        self.extrema_memory = deque(maxlen=memory_size // 5)  # Smaller size for extrema examples
+        self.extrema_memory = deque(maxlen=memory_size // 4)  # For extrema points
+        self.positive_memory = deque(maxlen=memory_size // 4)  # For positive rewards
        
        # Training metrics
        self.update_count = 0
        self.losses = []
+        self.avg_reward = 0
+        self.no_improvement_count = 0
+        self.best_reward = float('-inf')
        
    def remember(self, state: np.ndarray, action: int, reward: float,
                next_state: np.ndarray, done: bool, is_extrema: bool = False):
        """
-        Store experience in memory
+        Store experience in memory with prioritization
        
        Args:
            state: Current state
@@ -97,28 +99,124 @@ class DQNAgent:
            is_extrema: Whether this is a local extrema sample (for specialized learning)
        """
        experience = (state, action, reward, next_state, done)
+        
+        # Always add to main memory
        self.memory.append(experience)
        
-        # If this is an extrema sample, also add to specialized memory
+        # Add to specialized memories if applicable
        if is_extrema:
            self.extrema_memory.append(experience)
+            
+        # Store positive experiences separately for prioritized replay
+        if reward > 0:
+            self.positive_memory.append(experience)
    
-    def act(self, state: np.ndarray) -> int:
-        """Choose action using epsilon-greedy policy"""
-        if random.random() < self.epsilon:
+    def act(self, state: np.ndarray, explore=True) -> int:
+        """Choose an action epsilon-greedily; explore=False always takes the greedy action"""
+        if explore and random.random() < self.epsilon:
            return random.randrange(self.action_size)
        
        with torch.no_grad():
-            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
-            action_probs, extrema_pred = self.policy_net(state)
+            # Normalize the raw state, then add a leading batch dimension for the network
+            state_tensor = self._normalize_state(state)
+            state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device)
+            action_probs, extrema_pred = self.policy_net(state_tensor)
            return action_probs.argmax().item()
    
-    def replay(self, use_extrema=False) -> float:
+    def _normalize_state(self, state: np.ndarray) -> np.ndarray:
+        """
+        Return a sanitized, rescaled float32 copy of a single state.
+
+        A 1D state is min-max scaled as a whole. Otherwise the columns are split
+        into len(self.timeframes) equal groups; in each group columns 0-3 (assumed
+        OHLC) are divided by the mean of column 3, column 4 (assumed volume) is
+        z-scored and the rest are min-max scaled. Constant volume/indicator columns
+        become 0; columns left over after the last full group are not rescaled.
+
+        Args:
+            state: Array-like state; entries may be numbers, objects exposing a
+                callable timestamp() (replaced by its value) or anything else
+                (replaced by 0.0)
+
+        Returns:
+            np.ndarray: New float32 array with the same shape as the input
+        """
+        state = np.asarray(state)
+        if state.dtype.kind in 'iuf':
+            # Plain numeric input needs no per-element inspection
+            numeric = state
+        else:
+            def to_float(value) -> float:
+                """Map one arbitrary entry to a float (0.0 if it is not numeric)"""
+                stamp = getattr(value, 'timestamp', None)
+                if callable(stamp):
+                    return float(stamp())  # seconds since epoch
+                if isinstance(value, (int, float, np.number)):
+                    return float(value)
+                return 0.0
+
+            numeric = np.array([to_float(v) for v in state.flat], dtype=np.float64)
+            numeric = numeric.reshape(state.shape)
+
+        # Sanitize only after conversion: nan_to_num leaves object arrays untouched
+        normalized_state = np.nan_to_num(numeric, nan=0.0, posinf=1.0, neginf=-1.0)
+        normalized_state = normalized_state.astype(np.float32)
+
+        if normalized_state.ndim == 1:
+            # 1D state (happens in some environments): scale the whole vector
+            if normalized_state.size == 0:
+                return normalized_state
+            state_min = np.min(normalized_state)
+            state_max = np.max(normalized_state)
+            if state_max > state_min:
+                normalized_state = (normalized_state - state_min) / (state_max - state_min)
+            return normalized_state
+
+        # Each timeframe owns an equally sized, contiguous group of columns
+        feature_count = normalized_state.shape[1] // len(self.timeframes)
+        price_idx = 3  # Assuming close price is at index 3
+        vol_idx = 4  # Assuming volume is at index 4
+        if feature_count <= price_idx:
+            # No close column in any group, so nothing is rescaled
+            return normalized_state
+
+        for tf_idx in range(len(self.timeframes)):
+            start_idx = tf_idx * feature_count
+            # View into normalized_state: the writes below update it in place
+            tf_features = normalized_state[:, start_idx:start_idx + feature_count]
+
+            # Normalize OHLC by the window's mean close price so that price
+            # movements become relative rather than absolute
+            reference_price = np.mean(tf_features[:, price_idx])
+            if reference_price != 0:
+                tf_features[:, :price_idx + 1] /= reference_price
+
+            # Normalize volume using mean and std
+            if vol_idx < feature_count:
+                volume = tf_features[:, vol_idx]
+                vol_std = np.std(volume)
+                if vol_std > 0:
+                    tf_features[:, vol_idx] = (volume - np.mean(volume)) / vol_std
+                else:
+                    tf_features[:, vol_idx] = 0
+
+            # Other features (technical indicators): per-column min-max scaling
+            for col in range(vol_idx + 1, feature_count):
+                feature_min = np.min(tf_features[:, col])
+                feature_max = np.max(tf_features[:, col])
+                if feature_max > feature_min:
+                    tf_features[:, col] = (tf_features[:, col] - feature_min) / (feature_max - feature_min)
+                else:
+                    tf_features[:, col] = 0
+
+        return normalized_state
+    
+    def replay(self, use_prioritized=True) -> float:
        """
-        Train on a batch of experiences
+        Train on a batch of experiences with prioritized sampling
        
        Args:
-            use_extrema: Whether to include extrema samples in training
+            use_prioritized: Whether to use prioritized replay
            
        Returns:
            float: Loss value
@@ -126,55 +224,67 @@ class DQNAgent:
        if len(self.memory) < self.batch_size:
            return 0.0
        
-        # Sample batch - mix regular and extrema samples
+        # Sample batch with prioritization
        batch = []
-        if use_extrema and len(self.extrema_memory) > self.batch_size // 4:
-            # Get some extrema samples
-            extrema_count = min(self.batch_size // 3, len(self.extrema_memory))
-            extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
+        
+        if use_prioritized and len(self.positive_memory) > 0 and len(self.extrema_memory) > 0:
+            # Prioritized sampling from different memory types
+            positive_count = min(self.batch_size // 4, len(self.positive_memory))
+            extrema_count = min(self.batch_size // 4, len(self.extrema_memory))
+            regular_count = self.batch_size - positive_count - extrema_count
            
-            # Get regular samples for the rest
-            regular_count = self.batch_size - extrema_count
+            positive_samples = random.sample(list(self.positive_memory), positive_count)
+            extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
            regular_samples = random.sample(list(self.memory), regular_count)
            
-            # Combine samples
-            batch = extrema_samples + regular_samples
+            batch = positive_samples + extrema_samples + regular_samples
        else:
            # Standard sampling
            batch = random.sample(self.memory, self.batch_size)
        
        states, actions, rewards, next_states, dones = zip(*batch)
        
+        # Normalize states before training
+        normalized_states = np.array([self._normalize_state(state) for state in states])
+        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
+        
        # Convert to tensors and move to device
-        states = torch.FloatTensor(np.array(states)).to(self.device)
-        actions = torch.LongTensor(actions).to(self.device)
-        rewards = torch.FloatTensor(rewards).to(self.device)
-        next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
-        dones = torch.FloatTensor(dones).to(self.device)
+        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
+        actions_tensor = torch.LongTensor(actions).to(self.device)
+        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
+        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
+        dones_tensor = torch.FloatTensor(dones).to(self.device)
        
        # Get current Q values
-        current_q_values, extrema_pred = self.policy_net(states)
-        current_q_values = current_q_values.gather(1, actions.unsqueeze(1))
+        current_q_values, extrema_pred = self.policy_net(states_tensor)
+        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
        
-        # Get next Q values from target network
+        # Get next Q values from target network (Double DQN approach)
        with torch.no_grad():
-            next_q_values, _ = self.target_net(next_states)
-            next_q_values = next_q_values.max(1)[0]
-            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
+            # Get actions from policy network
+            next_actions, _ = self.policy_net(next_states_tensor)
+            next_actions = next_actions.max(1)[1].unsqueeze(1)
+            
+            # Get Q values from target network for those actions
+            next_q_values, _ = self.target_net(next_states_tensor)
+            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
+            
+            # Compute target Q values
+            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
        
-        # Compute Q-learning loss
-        q_loss = nn.MSELoss()(current_q_values.squeeze(), target_q_values)
+        # Clamp target values to prevent extreme values
+        target_q_values = torch.clamp(target_q_values, -100, 100)
        
-        # If we have extrema labels (not in this implementation yet), 
-        # we could add an additional loss for extrema prediction
-        # This would require labels for whether each state is near an extrema
-        
-        # Total loss is just Q-learning loss for now
-        loss = q_loss
+        # Compute Huber loss (more robust to outliers than MSE)
+        loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
        
        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
+        
+        # Apply gradient clipping to prevent exploding gradients
+        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
+        
        self.optimizer.step()
        
        # Update target network if needed
@@ -200,37 +310,77 @@ class DQNAgent:
        """
        if len(states) == 0:
            return 0.0
+        
+        # Normalize states
+        normalized_states = np.array([self._normalize_state(state) for state in states])
+        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
            
        # Convert to tensors
-        states = torch.FloatTensor(np.array(states)).to(self.device)
-        actions = torch.LongTensor(actions).to(self.device)
-        rewards = torch.FloatTensor(rewards).to(self.device)
-        next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
-        dones = torch.FloatTensor(dones).to(self.device)
+        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
+        actions_tensor = torch.LongTensor(actions).to(self.device)
+        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
+        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
+        dones_tensor = torch.FloatTensor(dones).to(self.device)
        
        # Forward pass
-        current_q_values, extrema_pred = self.policy_net(states)
-        current_q_values = current_q_values.gather(1, actions.unsqueeze(1))
+        current_q_values, extrema_pred = self.policy_net(states_tensor)
+        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
        
-        # Get next Q values
+        # Get next Q values (Double DQN approach)
        with torch.no_grad():
-            next_q_values, _ = self.target_net(next_states)
-            next_q_values = next_q_values.max(1)[0]
-            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
+            next_actions, _ = self.policy_net(next_states_tensor)
+            next_actions = next_actions.max(1)[1].unsqueeze(1)
            
-        # Higher weight for extrema training
-        q_loss = nn.MSELoss()(current_q_values.squeeze(), target_q_values)
+            next_q_values, _ = self.target_net(next_states_tensor)
+            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
+            
+            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
+            
+        # Clamp target values
+        target_q_values = torch.clamp(target_q_values, -100, 100)
        
-        # Full loss is just Q-learning loss
+        # Use Huber loss for extrema training
+        q_loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
+        
+        # Full loss
        loss = q_loss
        
        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
+        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()
        
        return loss.item()
    
+    def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
+        """Update reward metrics and halve the learning rate after 10 stalled episodes.
+
+        Returns True if episode_reward beats best_reward by the relative threshold.
+        """
+        # Exponential moving average; an average of exactly 0 counts as "not started"
+        unset = self.avg_reward == 0
+        self.avg_reward = episode_reward if unset else 0.95 * self.avg_reward + 0.05 * episode_reward
+        # abs() keeps the margin positive for negative rewards; -inf means "no best yet"
+        target = self.best_reward
+        if target != float('-inf'):
+            target += best_reward_threshold * abs(target)
+        if episode_reward > target:
+            self.best_reward = episode_reward
+            self.no_improvement_count = 0
+            return True  # Improved
+
+        self.no_improvement_count += 1
+        if self.no_improvement_count >= 10:
+            current_lr = self.optimizer.param_groups[0]['lr']
+            new_lr = current_lr * 0.5
+            if new_lr >= 1e-6:  # Don't reduce below minimum threshold
+                for param_group in self.optimizer.param_groups:
+                    param_group['lr'] = new_lr
+                logger.info(f"Reducing learning rate from {current_lr} to {new_lr}")
+                self.no_improvement_count = 0
+        return False  # No improvement
+    
    def save(self, path: str):
        """Save model and agent state"""
        os.makedirs(os.path.dirname(path), exist_ok=True)
@@ -246,9 +396,13 @@ class DQNAgent:
            'epsilon': self.epsilon,
            'update_count': self.update_count,
            'losses': self.losses,
-            'optimizer_state': self.optimizer.state_dict()
+            'optimizer_state': self.optimizer.state_dict(),
+            'best_reward': self.best_reward,
+            'avg_reward': self.avg_reward
        }
+        
        torch.save(state, f"{path}_agent_state.pt")
+        logger.info(f"Agent state saved to {path}_agent_state.pt")
    
    def load(self, path: str):
        """Load model and agent state"""
@@ -259,8 +413,19 @@ class DQNAgent:
        self.target_net.load(f"{path}_target")
        
        # Load agent state
-        state = torch.load(f"{path}_agent_state.pt")
-        self.epsilon = state['epsilon']
-        self.update_count = state['update_count']
-        self.losses = state['losses']
-        self.optimizer.load_state_dict(state['optimizer_state']) 
+        try:
+            agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device)
+            self.epsilon = agent_state['epsilon']
+            self.update_count = agent_state['update_count']
+            self.losses = agent_state['losses']
+            self.optimizer.load_state_dict(agent_state['optimizer_state'])
+            
+            # Load additional metrics if they exist
+            if 'best_reward' in agent_state:
+                self.best_reward = agent_state['best_reward']
+            if 'avg_reward' in agent_state:
+                self.avg_reward = agent_state['avg_reward']
+                
+            logger.info(f"Agent state loaded from {path}_agent_state.pt")
+        except FileNotFoundError:
+            logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")