in the business - but WIP
@@ -757,20 +757,98 @@ class DQNAgent:
        # Sanitize and stack states and next_states
        sanitized_states = []
        sanitized_next_states = []
        sanitized_experiences = []

        for i, e in enumerate(experiences):
            try:
                # Extract experience components
                state, action, reward, next_state, done = e

                # Sanitize state - convert any dict/object to float arrays
                state = self._sanitize_state_data(state)
                next_state = self._sanitize_state_data(next_state)

                # Sanitize action - ensure it's an integer
                if isinstance(action, dict):
                    # If action is a dict, try to extract action value
                    action = action.get('action', action.get('value', 0))
                action = int(action) if not isinstance(action, (int, np.integer)) else action

                # Sanitize reward - ensure it's a float
                if isinstance(reward, dict):
                    # If reward is a dict, try to extract reward value
                    reward = reward.get('reward', reward.get('value', 0.0))
                reward = float(reward) if not isinstance(reward, (float, np.floating)) else reward

                # Sanitize done - ensure it's a boolean/float
                if isinstance(done, dict):
                    done = done.get('done', done.get('value', False))
                done = bool(done) if not isinstance(done, (bool, np.bool_)) else done
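                # Illustrative payloads this handles (hypothetical producers,
                # not confirmed by this repo):
                #   action {'action': 2}    -> 2
                #   reward {'reward': 0.75} -> 0.75
                #   done   {'done': True}   -> True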

                # Convert state to proper numpy array
                state = np.asarray(state, dtype=np.float32)
                next_state = np.asarray(next_state, dtype=np.float32)

                # Add to sanitized lists
                sanitized_states.append(state)
                sanitized_next_states.append(next_state)
                sanitized_experiences.append((state, action, reward, next_state, done))

            except Exception as ex:
                logger.warning(f"[DQNAgent] Bad experience at index {i}: {ex}")
                continue

        if not sanitized_states or not sanitized_next_states:
            logger.warning("[DQNAgent] No valid states in replay batch.")
            return 0.0  # Return float instead of None for consistency

        # Validate all states have the same dimensions before stacking
        expected_dim = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
        if isinstance(expected_dim, tuple):
            expected_dim = int(np.prod(expected_dim))
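        # e.g. a windowed observation of shape (4, 100) would flatten to
        # expected_dim = 400 (shape hypothetical; 403 is only the hard-coded fallback)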

        # Filter out states with wrong dimensions and fix them
        valid_states = []
        valid_next_states = []
        valid_experiences = []

        for i, (state, next_state, exp) in enumerate(zip(sanitized_states, sanitized_next_states, sanitized_experiences)):
            # Ensure states have correct dimensions
            if len(state) != expected_dim:
                logger.debug(f"Fixing state dimension: {len(state)} -> {expected_dim}")
                if len(state) < expected_dim:
                    # Pad with zeros
                    padded_state = np.zeros(expected_dim, dtype=np.float32)
                    padded_state[:len(state)] = state
                    state = padded_state
                else:
                    # Truncate
                    state = state[:expected_dim]

            if len(next_state) != expected_dim:
                logger.debug(f"Fixing next_state dimension: {len(next_state)} -> {expected_dim}")
                if len(next_state) < expected_dim:
                    # Pad with zeros
                    padded_next_state = np.zeros(expected_dim, dtype=np.float32)
                    padded_next_state[:len(next_state)] = next_state
                    next_state = padded_next_state
                else:
                    # Truncate
                    next_state = next_state[:expected_dim]
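            # Note: zero-padding assumes a missing feature is neutral at 0 and
            # truncation drops trailing features - heuristics, not a guarantee
            # that the repaired vector is semantically valid.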

            valid_states.append(state)
            valid_next_states.append(next_state)
            valid_experiences.append(exp)

        if not valid_states:
            logger.warning("[DQNAgent] No valid states after dimension fixing.")
            return 0.0

        # Use validated experiences for training
        experiences = valid_experiences

        states = torch.FloatTensor(np.stack(valid_states)).to(self.device)
        next_states = torch.FloatTensor(np.stack(valid_next_states)).to(self.device)
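        # np.stack above requires uniform shapes; the pad/truncate pass is what
        # guarantees that for every surviving state.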

        # Choose appropriate replay method
        if self.use_mixed_precision:
@@ -797,28 +875,42 @@ class DQNAgent:
            extrema_indices = np.random.choice(len(self.extrema_memory), size=min(self.batch_size, len(self.extrema_memory)), replace=False)
            extrema_batch = [self.extrema_memory[i] for i in extrema_indices]

            # Sanitize extrema batch
            sanitized_extrema = []
            for e in extrema_batch:
                try:
                    state, action, reward, next_state, done = e
                    state = self._sanitize_state_data(state)
                    next_state = self._sanitize_state_data(next_state)
                    state = np.asarray(state, dtype=np.float32)
                    next_state = np.asarray(next_state, dtype=np.float32)
                    sanitized_extrema.append((state, action, reward, next_state, done))
                except Exception:
                    continue
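            # Entries that fail to unpack or sanitize are dropped rather than
            # trained on - the same policy as the main replay loop above.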

            if sanitized_extrema:
                # Extract tensors from extrema batch
                extrema_states = torch.FloatTensor(np.array([e[0] for e in sanitized_extrema])).to(self.device)
                extrema_actions = torch.LongTensor(np.array([e[1] for e in sanitized_extrema])).to(self.device)
                extrema_rewards = torch.FloatTensor(np.array([e[2] for e in sanitized_extrema])).to(self.device)
                extrema_next_states = torch.FloatTensor(np.array([e[3] for e in sanitized_extrema])).to(self.device)
                extrema_dones = torch.FloatTensor(np.array([e[4] for e in sanitized_extrema])).to(self.device)

                # Use a slightly reduced learning rate for extrema training
                old_lr = self.optimizer.param_groups[0]['lr']
                self.optimizer.param_groups[0]['lr'] = old_lr * 0.8
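                # The 0.8 factor presumably keeps these auxiliary updates
                # gentler than the main replay step; it reads as a tuning
                # choice, not a derived constant.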

                # Train on extrema memory
                if self.use_mixed_precision:
                    extrema_loss = self._replay_mixed_precision(extrema_states, extrema_actions, extrema_rewards, extrema_next_states, extrema_dones)
                else:
                    extrema_loss = self._replay_standard(sanitized_extrema)

                # Reset learning rate
                self.optimizer.param_groups[0]['lr'] = old_lr

                # Log extrema loss
                logger.info(f"Extra training on extrema points, loss: {extrema_loss:.4f}")

        # Randomly train on price movement examples (similar to extrema)
        if random.random() < 0.3 and len(self.price_movement_memory) >= self.batch_size:
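            # With probability 0.3, this replay call gets one extra training
            # pass over price-movement samples (hence "randomly" above).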
@@ -826,28 +918,42 @@ class DQNAgent:
            price_indices = np.random.choice(len(self.price_movement_memory), size=min(self.batch_size, len(self.price_movement_memory)), replace=False)
            price_batch = [self.price_movement_memory[i] for i in price_indices]

            # Sanitize price movement batch
            sanitized_price = []
            for e in price_batch:
                try:
                    state, action, reward, next_state, done = e
                    state = self._sanitize_state_data(state)
                    next_state = self._sanitize_state_data(next_state)
                    state = np.asarray(state, dtype=np.float32)
                    next_state = np.asarray(next_state, dtype=np.float32)
                    sanitized_price.append((state, action, reward, next_state, done))
                except Exception:
                    continue

            if sanitized_price:
                # Extract tensors from price movement batch
                price_states = torch.FloatTensor(np.array([e[0] for e in sanitized_price])).to(self.device)
                price_actions = torch.LongTensor(np.array([e[1] for e in sanitized_price])).to(self.device)
                price_rewards = torch.FloatTensor(np.array([e[2] for e in sanitized_price])).to(self.device)
                price_next_states = torch.FloatTensor(np.array([e[3] for e in sanitized_price])).to(self.device)
                price_dones = torch.FloatTensor(np.array([e[4] for e in sanitized_price])).to(self.device)

                # Use a slightly reduced learning rate for price movement training
                old_lr = self.optimizer.param_groups[0]['lr']
                self.optimizer.param_groups[0]['lr'] = old_lr * 0.75
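                # 0.75 here vs 0.8 for extrema: price-movement batches get a
                # slightly gentler update still; again this reads as a tuning
                # choice rather than a derived constant.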

                # Train on price movement memory
                if self.use_mixed_precision:
                    price_loss = self._replay_mixed_precision(price_states, price_actions, price_rewards, price_next_states, price_dones)
                else:
                    price_loss = self._replay_standard(sanitized_price)

                # Reset learning rate
                self.optimizer.param_groups[0]['lr'] = old_lr

                # Log price movement loss
                logger.info(f"Extra training on price movement examples, loss: {price_loss:.4f}")

        return loss
@@ -1452,4 +1558,106 @@ class DQNAgent:
        total_params = 0
        for param in self.policy_net.parameters():
            total_params += param.numel()
        return total_params

    def _sanitize_state_data(self, state):
        """Sanitize state data to ensure it's a proper numeric array"""
        try:
            # If state is already a numpy array, return it
            if isinstance(state, np.ndarray):
                # Check for non-numeric data and handle it
                if state.dtype == object:
                    # Convert object array to float array
                    sanitized = np.zeros_like(state, dtype=np.float32)
                    for i in range(state.shape[0]):
                        if len(state.shape) > 1:
                            for j in range(state.shape[1]):
                                sanitized[i, j] = self._extract_numeric_value(state[i, j])
                        else:
                            sanitized[i] = self._extract_numeric_value(state[i])
                    return sanitized
                else:
                    return state.astype(np.float32)

            # If state is a list or tuple, convert to array
            elif isinstance(state, (list, tuple)):
                # Recursively sanitize each element
                sanitized = []
                for item in state:
                    if isinstance(item, (list, tuple)):
                        sanitized_row = []
                        for sub_item in item:
                            sanitized_row.append(self._extract_numeric_value(sub_item))
                        sanitized.append(sanitized_row)
                    else:
                        sanitized.append(self._extract_numeric_value(item))
                return np.array(sanitized, dtype=np.float32)

            # If state is a dict, try to extract values
            elif isinstance(state, dict):
                # Try to extract meaningful values from dict
                values = []
                for key in sorted(state.keys()):  # Sort for consistency
                    values.append(self._extract_numeric_value(state[key]))
                return np.array(values, dtype=np.float32)

            # If state is a single value, make it an array
            else:
                return np.array([self._extract_numeric_value(state)], dtype=np.float32)

        except Exception as e:
            logger.warning(f"Error sanitizing state data: {e}. Using zero array with expected dimensions.")
            # Return a zero array as fallback with the expected state dimension
            # Use the state_dim from initialization, fallback to 403 if not available
            expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
            if isinstance(expected_size, tuple):
                expected_size = np.prod(expected_size)
            return np.zeros(int(expected_size), dtype=np.float32)
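
    # Illustrative behaviour of _sanitize_state_data (hypothetical inputs):
    #   {'price': 100.5, 'volume': 2.0} -> array([100.5, 2.0], dtype=float32)  # keys sorted
    #   [1, None, '3']                  -> array([1.0, 0.0, 3.0], dtype=float32)
    #   7                               -> array([7.0], dtype=float32)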

    def _extract_numeric_value(self, value):
        """Extract a numeric value from various data types"""
        try:
            # Handle None values
            if value is None:
                return 0.0

            # Handle boolean values (checked before int/float, since bool is an int subclass)
            if isinstance(value, bool):
                return float(value)

            # Handle numeric types
            if isinstance(value, (int, float, np.number)):
                return float(value)

            # Handle dict values
            elif isinstance(value, dict):
                # Try common keys for numeric data
                for key in ['value', 'price', 'close', 'last', 'amount', 'quantity']:
                    if key in value:
                        return self._extract_numeric_value(value[key])
                # If no common keys, try to get first numeric value
                for v in value.values():
                    if isinstance(v, (int, float, np.number)):
                        return float(v)
                return 0.0

            # Handle string values that might be numeric
            elif isinstance(value, str):
                try:
                    return float(value)
                except ValueError:
                    return 0.0

            # Handle datetime objects
            elif hasattr(value, 'timestamp'):
                return float(value.timestamp())

            # Handle list/tuple - take first numeric value
            elif isinstance(value, (list, tuple)) and len(value) > 0:
                return self._extract_numeric_value(value[0])

            else:
                return 0.0

        except Exception:
            return 0.0
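
    # Illustrative behaviour of _extract_numeric_value (hypothetical inputs):
    #   None          -> 0.0
    #   {'close': 42} -> 42.0
    #   '3.14'        -> 3.14
    #   [9.5, 'junk'] -> 9.5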