in the business - but WIP
@@ -757,20 +757,98 @@ class DQNAgent:
        # Sanitize and stack states and next_states
        sanitized_states = []
        sanitized_next_states = []
        sanitized_experiences = []

        for i, e in enumerate(experiences):
            try:
                # Extract experience components
                state, action, reward, next_state, done = e

                # Sanitize state - convert any dict/object to float arrays
                state = self._sanitize_state_data(state)
                next_state = self._sanitize_state_data(next_state)

                # Sanitize action - ensure it's an integer
                if isinstance(action, dict):
                    # If action is a dict, try to extract action value
                    action = action.get('action', action.get('value', 0))
                action = int(action) if not isinstance(action, (int, np.integer)) else action

                # Sanitize reward - ensure it's a float
                if isinstance(reward, dict):
                    # If reward is a dict, try to extract reward value
                    reward = reward.get('reward', reward.get('value', 0.0))
                reward = float(reward) if not isinstance(reward, (float, np.floating)) else reward

                # Sanitize done - ensure it's a boolean/float
                if isinstance(done, dict):
                    done = done.get('done', done.get('value', False))
                done = bool(done) if not isinstance(done, (bool, np.bool_)) else done
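                # Illustrative payloads this handles (hypothetical producers,
                # not confirmed by this repo):
                #   action {'action': 2}    -> 2
                #   reward {'reward': 0.75} -> 0.75
                #   done   {'done': True}   -> True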

                # Convert state to proper numpy array
                state = np.asarray(state, dtype=np.float32)
                next_state = np.asarray(next_state, dtype=np.float32)

                # Add to sanitized lists
                sanitized_states.append(state)
                sanitized_next_states.append(next_state)
                sanitized_experiences.append((state, action, reward, next_state, done))

            except Exception as ex:
                logger.warning(f"[DQNAgent] Bad experience at index {i}: {ex}")
                continue

        if not sanitized_states or not sanitized_next_states:
            logger.warning("[DQNAgent] No valid states in replay batch.")
            return 0.0  # Return float instead of None for consistency

        # Validate all states have the same dimensions before stacking
        expected_dim = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
        if isinstance(expected_dim, tuple):
            expected_dim = int(np.prod(expected_dim))
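        # e.g. a windowed observation of shape (4, 100) would flatten to
        # expected_dim = 400 (shape hypothetical; 403 is only the hard-coded fallback)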

        # Filter out states with wrong dimensions and fix them
        valid_states = []
        valid_next_states = []
        valid_experiences = []

        for i, (state, next_state, exp) in enumerate(zip(sanitized_states, sanitized_next_states, sanitized_experiences)):
            # Ensure states have correct dimensions
            if len(state) != expected_dim:
                logger.debug(f"Fixing state dimension: {len(state)} -> {expected_dim}")
                if len(state) < expected_dim:
                    # Pad with zeros
                    padded_state = np.zeros(expected_dim, dtype=np.float32)
                    padded_state[:len(state)] = state
                    state = padded_state
                else:
                    # Truncate
                    state = state[:expected_dim]

            if len(next_state) != expected_dim:
                logger.debug(f"Fixing next_state dimension: {len(next_state)} -> {expected_dim}")
                if len(next_state) < expected_dim:
                    # Pad with zeros
                    padded_next_state = np.zeros(expected_dim, dtype=np.float32)
                    padded_next_state[:len(next_state)] = next_state
                    next_state = padded_next_state
                else:
                    # Truncate
                    next_state = next_state[:expected_dim]
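            # Note: zero-padding assumes a missing feature is neutral at 0 and
            # truncation drops trailing features - heuristics, not a guarantee
            # that the repaired vector is semantically valid.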

            valid_states.append(state)
            valid_next_states.append(next_state)
            valid_experiences.append(exp)

        if not valid_states:
            logger.warning("[DQNAgent] No valid states after dimension fixing.")
            return 0.0

        # Use validated experiences for training
        experiences = valid_experiences

        states = torch.FloatTensor(np.stack(valid_states)).to(self.device)
        next_states = torch.FloatTensor(np.stack(valid_next_states)).to(self.device)
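        # np.stack above requires uniform shapes; the pad/truncate pass is what
        # guarantees that for every surviving state.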

        # Choose appropriate replay method
        if self.use_mixed_precision:
@@ -797,28 +875,42 @@ class DQNAgent:
            extrema_indices = np.random.choice(len(self.extrema_memory), size=min(self.batch_size, len(self.extrema_memory)), replace=False)
            extrema_batch = [self.extrema_memory[i] for i in extrema_indices]

            # Sanitize extrema batch
            sanitized_extrema = []
            for e in extrema_batch:
                try:
                    state, action, reward, next_state, done = e
                    state = self._sanitize_state_data(state)
                    next_state = self._sanitize_state_data(next_state)
                    state = np.asarray(state, dtype=np.float32)
                    next_state = np.asarray(next_state, dtype=np.float32)
                    sanitized_extrema.append((state, action, reward, next_state, done))
                except Exception:
                    continue
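            # Entries that fail to unpack or sanitize are dropped rather than
            # trained on - the same policy as the main replay loop above.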

            if sanitized_extrema:
                # Extract tensors from extrema batch
                extrema_states = torch.FloatTensor(np.array([e[0] for e in sanitized_extrema])).to(self.device)
                extrema_actions = torch.LongTensor(np.array([e[1] for e in sanitized_extrema])).to(self.device)
                extrema_rewards = torch.FloatTensor(np.array([e[2] for e in sanitized_extrema])).to(self.device)
                extrema_next_states = torch.FloatTensor(np.array([e[3] for e in sanitized_extrema])).to(self.device)
                extrema_dones = torch.FloatTensor(np.array([e[4] for e in sanitized_extrema])).to(self.device)

                # Use a slightly reduced learning rate for extrema training
                old_lr = self.optimizer.param_groups[0]['lr']
                self.optimizer.param_groups[0]['lr'] = old_lr * 0.8
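                # The 0.8 factor presumably keeps these auxiliary updates
                # gentler than the main replay step; it reads as a tuning
                # choice, not a derived constant.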

                # Train on extrema memory
                if self.use_mixed_precision:
                    extrema_loss = self._replay_mixed_precision(extrema_states, extrema_actions, extrema_rewards, extrema_next_states, extrema_dones)
                else:
                    extrema_loss = self._replay_standard(sanitized_extrema)

                # Reset learning rate
                self.optimizer.param_groups[0]['lr'] = old_lr

                # Log extrema loss
                logger.info(f"Extra training on extrema points, loss: {extrema_loss:.4f}")

        # Randomly train on price movement examples (similar to extrema)
        if random.random() < 0.3 and len(self.price_movement_memory) >= self.batch_size:
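            # With probability 0.3, this replay call gets one extra training
            # pass over price-movement samples (hence "randomly" above).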
@@ -826,28 +918,42 @@ class DQNAgent:
            price_indices = np.random.choice(len(self.price_movement_memory), size=min(self.batch_size, len(self.price_movement_memory)), replace=False)
            price_batch = [self.price_movement_memory[i] for i in price_indices]

            # Sanitize price movement batch
            sanitized_price = []
            for e in price_batch:
                try:
                    state, action, reward, next_state, done = e
                    state = self._sanitize_state_data(state)
                    next_state = self._sanitize_state_data(next_state)
                    state = np.asarray(state, dtype=np.float32)
                    next_state = np.asarray(next_state, dtype=np.float32)
                    sanitized_price.append((state, action, reward, next_state, done))
                except Exception:
                    continue

            if sanitized_price:
                # Extract tensors from price movement batch
                price_states = torch.FloatTensor(np.array([e[0] for e in sanitized_price])).to(self.device)
                price_actions = torch.LongTensor(np.array([e[1] for e in sanitized_price])).to(self.device)
                price_rewards = torch.FloatTensor(np.array([e[2] for e in sanitized_price])).to(self.device)
                price_next_states = torch.FloatTensor(np.array([e[3] for e in sanitized_price])).to(self.device)
                price_dones = torch.FloatTensor(np.array([e[4] for e in sanitized_price])).to(self.device)

                # Use a slightly reduced learning rate for price movement training
                old_lr = self.optimizer.param_groups[0]['lr']
                self.optimizer.param_groups[0]['lr'] = old_lr * 0.75
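                # 0.75 here vs 0.8 for extrema: price-movement batches get a
                # slightly gentler update still; again this reads as a tuning
                # choice rather than a derived constant.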

                # Train on price movement memory
                if self.use_mixed_precision:
                    price_loss = self._replay_mixed_precision(price_states, price_actions, price_rewards, price_next_states, price_dones)
                else:
                    price_loss = self._replay_standard(sanitized_price)

                # Reset learning rate
                self.optimizer.param_groups[0]['lr'] = old_lr

                # Log price movement loss
                logger.info(f"Extra training on price movement examples, loss: {price_loss:.4f}")

        return loss
@@ -1452,4 +1558,106 @@ class DQNAgent:
        total_params = 0
        for param in self.policy_net.parameters():
            total_params += param.numel()
        return total_params

    def _sanitize_state_data(self, state):
        """Sanitize state data to ensure it's a proper numeric array"""
        try:
            # If state is already a numpy array, return it
            if isinstance(state, np.ndarray):
                # Check for non-numeric data and handle it
                if state.dtype == object:
                    # Convert object array to float array
                    sanitized = np.zeros_like(state, dtype=np.float32)
                    for i in range(state.shape[0]):
                        if len(state.shape) > 1:
                            for j in range(state.shape[1]):
                                sanitized[i, j] = self._extract_numeric_value(state[i, j])
                        else:
                            sanitized[i] = self._extract_numeric_value(state[i])
                    return sanitized
                else:
                    return state.astype(np.float32)

            # If state is a list or tuple, convert to array
            elif isinstance(state, (list, tuple)):
                # Recursively sanitize each element
                sanitized = []
                for item in state:
                    if isinstance(item, (list, tuple)):
                        sanitized_row = []
                        for sub_item in item:
                            sanitized_row.append(self._extract_numeric_value(sub_item))
                        sanitized.append(sanitized_row)
                    else:
                        sanitized.append(self._extract_numeric_value(item))
                return np.array(sanitized, dtype=np.float32)

            # If state is a dict, try to extract values
            elif isinstance(state, dict):
                # Try to extract meaningful values from dict
                values = []
                for key in sorted(state.keys()):  # Sort for consistency
                    values.append(self._extract_numeric_value(state[key]))
                return np.array(values, dtype=np.float32)

            # If state is a single value, make it an array
            else:
                return np.array([self._extract_numeric_value(state)], dtype=np.float32)

        except Exception as e:
            logger.warning(f"Error sanitizing state data: {e}. Using zero array with expected dimensions.")
            # Return a zero array as fallback with the expected state dimension
            # Use the state_dim from initialization, fallback to 403 if not available
            expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
            if isinstance(expected_size, tuple):
                expected_size = np.prod(expected_size)
            return np.zeros(int(expected_size), dtype=np.float32)
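
    # Illustrative behaviour of _sanitize_state_data (hypothetical inputs):
    #   {'price': 100.5, 'volume': 2.0} -> array([100.5, 2.0], dtype=float32)  # keys sorted
    #   [1, None, '3']                  -> array([1.0, 0.0, 3.0], dtype=float32)
    #   7                               -> array([7.0], dtype=float32)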

    def _extract_numeric_value(self, value):
        """Extract a numeric value from various data types"""
        try:
            # Handle None values
            if value is None:
                return 0.0

            # Handle boolean values (checked before int/float, since bool is an int subclass)
            if isinstance(value, bool):
                return float(value)

            # Handle numeric types
            if isinstance(value, (int, float, np.number)):
                return float(value)

            # Handle dict values
            elif isinstance(value, dict):
                # Try common keys for numeric data
                for key in ['value', 'price', 'close', 'last', 'amount', 'quantity']:
                    if key in value:
                        return self._extract_numeric_value(value[key])
                # If no common keys, try to get first numeric value
                for v in value.values():
                    if isinstance(v, (int, float, np.number)):
                        return float(v)
                return 0.0

            # Handle string values that might be numeric
            elif isinstance(value, str):
                try:
                    return float(value)
                except ValueError:
                    return 0.0

            # Handle datetime objects
            elif hasattr(value, 'timestamp'):
                return float(value.timestamp())

            # Handle list/tuple - take first numeric value
            elif isinstance(value, (list, tuple)) and len(value) > 0:
                return self._extract_numeric_value(value[0])

            else:
                return 0.0

        except Exception:
            return 0.0
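
    # Illustrative behaviour of _extract_numeric_value (hypothetical inputs):
    #   None          -> 0.0
    #   {'close': 42} -> 42.0
    #   '3.14'        -> 3.14
    #   [9.5, 'junk'] -> 9.5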