in the business - but WIP

Dobromir Popov
2025-07-14 12:58:16 +03:00
parent c651ae585a
commit ab232a1262
5 changed files with 693 additions and 381 deletions


@@ -757,20 +757,98 @@ class DQNAgent:
# Sanitize and stack states and next_states
sanitized_states = []
sanitized_next_states = []
sanitized_experiences = []
for i, e in enumerate(experiences):
try:
state = np.asarray(e[0], dtype=np.float32)
next_state = np.asarray(e[3], dtype=np.float32)
# Extract experience components
state, action, reward, next_state, done = e
# Sanitize state - convert any dict/object to float arrays
state = self._sanitize_state_data(state)
next_state = self._sanitize_state_data(next_state)
# Sanitize action - ensure it's an integer
if isinstance(action, dict):
# If action is a dict, try to extract action value
action = action.get('action', action.get('value', 0))
action = int(action) if not isinstance(action, (int, np.integer)) else action
# Sanitize reward - ensure it's a float
if isinstance(reward, dict):
# If reward is a dict, try to extract reward value
reward = reward.get('reward', reward.get('value', 0.0))
reward = float(reward) if not isinstance(reward, (float, np.floating)) else reward
# Sanitize done - ensure it's a boolean/float
if isinstance(done, dict):
done = done.get('done', done.get('value', False))
done = bool(done) if not isinstance(done, (bool, np.bool_)) else done
# Convert state to proper numpy array
state = np.asarray(state, dtype=np.float32)
next_state = np.asarray(next_state, dtype=np.float32)
# Add to sanitized lists
sanitized_states.append(state)
sanitized_next_states.append(next_state)
sanitized_experiences.append((state, action, reward, next_state, done))
except Exception as ex:
print(f"[DQNAgent] Bad experience at index {i}: {ex}")
continue
if not sanitized_states or not sanitized_next_states:
print("[DQNAgent] No valid states in replay batch.")
return 0.0 # Return float instead of None for consistency
states = torch.FloatTensor(np.stack(sanitized_states)).to(self.device)
next_states = torch.FloatTensor(np.stack(sanitized_next_states)).to(self.device)
# Validate all states have the same dimensions before stacking
expected_dim = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_dim, tuple):
expected_dim = np.prod(expected_dim)
# Filter out states with wrong dimensions and fix them
valid_states = []
valid_next_states = []
valid_experiences = []
for i, (state, next_state, exp) in enumerate(zip(sanitized_states, sanitized_next_states, sanitized_experiences)):
# Ensure states have correct dimensions
if len(state) != expected_dim:
logger.debug(f"Fixing state dimension: {len(state)} -> {expected_dim}")
if len(state) < expected_dim:
# Pad with zeros
padded_state = np.zeros(expected_dim, dtype=np.float32)
padded_state[:len(state)] = state
state = padded_state
else:
# Truncate
state = state[:expected_dim]
if len(next_state) != expected_dim:
logger.debug(f"Fixing next_state dimension: {len(next_state)} -> {expected_dim}")
if len(next_state) < expected_dim:
# Pad with zeros
padded_next_state = np.zeros(expected_dim, dtype=np.float32)
padded_next_state[:len(next_state)] = next_state
next_state = padded_next_state
else:
# Truncate
next_state = next_state[:expected_dim]
valid_states.append(state)
valid_next_states.append(next_state)
valid_experiences.append(exp)
if not valid_states:
print("[DQNAgent] No valid states after dimension fixing.")
return 0.0
# Use validated experiences for training
experiences = valid_experiences
states = torch.FloatTensor(np.stack(valid_states)).to(self.device)
next_states = torch.FloatTensor(np.stack(valid_next_states)).to(self.device)
# Choose appropriate replay method
if self.use_mixed_precision:
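As an aside, the sanitize-then-normalize flow above can be exercised in isolation. A minimal sketch, assuming an expected state dimension of 403 and dict-valued action/reward fields as handled by this commit; the helper normalize_dim and the sample tuple below are illustrative, not part of the commit:

import numpy as np

def normalize_dim(vec, expected_dim=403):
    # Zero-pad or truncate so every state vector has exactly expected_dim entries
    vec = np.asarray(vec, dtype=np.float32)
    if len(vec) < expected_dim:
        padded = np.zeros(expected_dim, dtype=np.float32)
        padded[:len(vec)] = vec
        return padded
    return vec[:expected_dim]

# A "dirty" experience with dict-valued action/reward and a short state vector
state, action, reward, next_state, done = ([1.0, 2.0], {'action': 1}, {'reward': 0.5}, [1.1, 2.1], False)
action = action.get('action', 0) if isinstance(action, dict) else int(action)      # -> 1
reward = reward.get('reward', 0.0) if isinstance(reward, dict) else float(reward)  # -> 0.5
state = normalize_dim(state)            # -> shape (403,), zero-padded
next_state = normalize_dim(next_state)  # -> shape (403,), zero-padded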
@@ -797,28 +875,42 @@ class DQNAgent:
extrema_indices = np.random.choice(len(self.extrema_memory), size=min(self.batch_size, len(self.extrema_memory)), replace=False)
extrema_batch = [self.extrema_memory[i] for i in extrema_indices]
# Extract tensors from extrema batch
extrema_states = torch.FloatTensor(np.array([e[0] for e in extrema_batch])).to(self.device)
extrema_actions = torch.LongTensor(np.array([e[1] for e in extrema_batch])).to(self.device)
extrema_rewards = torch.FloatTensor(np.array([e[2] for e in extrema_batch])).to(self.device)
extrema_next_states = torch.FloatTensor(np.array([e[3] for e in extrema_batch])).to(self.device)
extrema_dones = torch.FloatTensor(np.array([e[4] for e in extrema_batch])).to(self.device)
# Sanitize extrema batch
sanitized_extrema = []
for e in extrema_batch:
try:
state, action, reward, next_state, done = e
state = self._sanitize_state_data(state)
next_state = self._sanitize_state_data(next_state)
state = np.asarray(state, dtype=np.float32)
next_state = np.asarray(next_state, dtype=np.float32)
sanitized_extrema.append((state, action, reward, next_state, done))
except Exception:
continue
# Use a slightly reduced learning rate for extrema training
old_lr = self.optimizer.param_groups[0]['lr']
self.optimizer.param_groups[0]['lr'] = old_lr * 0.8
# Train on extrema memory
if self.use_mixed_precision:
extrema_loss = self._replay_mixed_precision(extrema_states, extrema_actions, extrema_rewards, extrema_next_states, extrema_dones)
else:
extrema_loss = self._replay_standard(extrema_batch)
# Reset learning rate
self.optimizer.param_groups[0]['lr'] = old_lr
# Log extrema loss
logger.info(f"Extra training on extrema points, loss: {extrema_loss:.4f}")
if sanitized_extrema:
# Extract tensors from extrema batch
extrema_states = torch.FloatTensor(np.array([e[0] for e in sanitized_extrema])).to(self.device)
extrema_actions = torch.LongTensor(np.array([e[1] for e in sanitized_extrema])).to(self.device)
extrema_rewards = torch.FloatTensor(np.array([e[2] for e in sanitized_extrema])).to(self.device)
extrema_next_states = torch.FloatTensor(np.array([e[3] for e in sanitized_extrema])).to(self.device)
extrema_dones = torch.FloatTensor(np.array([e[4] for e in sanitized_extrema])).to(self.device)
# Use a slightly reduced learning rate for extrema training
old_lr = self.optimizer.param_groups[0]['lr']
self.optimizer.param_groups[0]['lr'] = old_lr * 0.8
# Train on extrema memory
if self.use_mixed_precision:
extrema_loss = self._replay_mixed_precision(extrema_states, extrema_actions, extrema_rewards, extrema_next_states, extrema_dones)
else:
extrema_loss = self._replay_standard(sanitized_extrema)
# Reset learning rate
self.optimizer.param_groups[0]['lr'] = old_lr
# Log extrema loss
logger.info(f"Extra training on extrema points, loss: {extrema_loss:.4f}")
# Randomly train on price movement examples (similar to extrema)
if random.random() < 0.3 and len(self.price_movement_memory) >= self.batch_size:
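Both auxiliary passes (extrema above, price movement below) temporarily scale down the optimizer learning rate and restore it afterwards. The same pattern can be captured in a small context manager; this is an illustrative sketch, not code from the commit:

from contextlib import contextmanager

@contextmanager
def scaled_lr(optimizer, factor):
    # Temporarily scale the learning rate of every param group, then restore it
    old_lrs = [group['lr'] for group in optimizer.param_groups]
    for group in optimizer.param_groups:
        group['lr'] *= factor
    try:
        yield
    finally:
        for group, lr in zip(optimizer.param_groups, old_lrs):
            group['lr'] = lr

# Equivalent to the manual save/scale/restore around the extrema (0.8) and
# price-movement (0.75) training calls:
# with scaled_lr(self.optimizer, 0.8):
#     extrema_loss = self._replay_standard(sanitized_extrema)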
@@ -826,28 +918,42 @@ class DQNAgent:
price_indices = np.random.choice(len(self.price_movement_memory), size=min(self.batch_size, len(self.price_movement_memory)), replace=False)
price_batch = [self.price_movement_memory[i] for i in price_indices]
# Extract tensors from price movement batch
price_states = torch.FloatTensor(np.array([e[0] for e in price_batch])).to(self.device)
price_actions = torch.LongTensor(np.array([e[1] for e in price_batch])).to(self.device)
price_rewards = torch.FloatTensor(np.array([e[2] for e in price_batch])).to(self.device)
price_next_states = torch.FloatTensor(np.array([e[3] for e in price_batch])).to(self.device)
price_dones = torch.FloatTensor(np.array([e[4] for e in price_batch])).to(self.device)
# Sanitize price movement batch
sanitized_price = []
for e in price_batch:
try:
state, action, reward, next_state, done = e
state = self._sanitize_state_data(state)
next_state = self._sanitize_state_data(next_state)
state = np.asarray(state, dtype=np.float32)
next_state = np.asarray(next_state, dtype=np.float32)
sanitized_price.append((state, action, reward, next_state, done))
except Exception:
continue
# Use a slightly reduced learning rate for price movement training
old_lr = self.optimizer.param_groups[0]['lr']
self.optimizer.param_groups[0]['lr'] = old_lr * 0.75
# Train on price movement memory
if self.use_mixed_precision:
price_loss = self._replay_mixed_precision(price_states, price_actions, price_rewards, price_next_states, price_dones)
else:
price_loss = self._replay_standard(price_batch)
# Reset learning rate
self.optimizer.param_groups[0]['lr'] = old_lr
# Log price movement loss
logger.info(f"Extra training on price movement examples, loss: {price_loss:.4f}")
if sanitized_price:
# Extract tensors from price movement batch
price_states = torch.FloatTensor(np.array([e[0] for e in sanitized_price])).to(self.device)
price_actions = torch.LongTensor(np.array([e[1] for e in sanitized_price])).to(self.device)
price_rewards = torch.FloatTensor(np.array([e[2] for e in sanitized_price])).to(self.device)
price_next_states = torch.FloatTensor(np.array([e[3] for e in sanitized_price])).to(self.device)
price_dones = torch.FloatTensor(np.array([e[4] for e in sanitized_price])).to(self.device)
# Use a slightly reduced learning rate for price movement training
old_lr = self.optimizer.param_groups[0]['lr']
self.optimizer.param_groups[0]['lr'] = old_lr * 0.75
# Train on price movement memory
if self.use_mixed_precision:
price_loss = self._replay_mixed_precision(price_states, price_actions, price_rewards, price_next_states, price_dones)
else:
price_loss = self._replay_standard(sanitized_price)
# Reset learning rate
self.optimizer.param_groups[0]['lr'] = old_lr
# Log price movement loss
logger.info(f"Extra training on price movement examples, loss: {price_loss:.4f}")
return loss
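The tensor-extraction step is written out twice above (extrema and price-movement batches). Condensed into one helper for reference; this sketch assumes each sanitized tuple is (state, action, reward, next_state, done) with purely numeric fields, and batch_to_tensors itself is not part of the commit:

import numpy as np
import torch

def batch_to_tensors(batch, device):
    # Split a list of experience tuples into the five training tensors
    states      = torch.FloatTensor(np.array([e[0] for e in batch])).to(device)
    actions     = torch.LongTensor(np.array([e[1] for e in batch])).to(device)
    rewards     = torch.FloatTensor(np.array([e[2] for e in batch])).to(device)
    next_states = torch.FloatTensor(np.array([e[3] for e in batch])).to(device)
    dones       = torch.FloatTensor(np.array([e[4] for e in batch])).to(device)
    return states, actions, rewards, next_states, dones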
@@ -1452,4 +1558,106 @@ class DQNAgent:
total_params = 0
for param in self.policy_net.parameters():
total_params += param.numel()
return total_params
def _sanitize_state_data(self, state):
"""Sanitize state data to ensure it's a proper numeric array"""
try:
# If state is already a numpy array, return it
if isinstance(state, np.ndarray):
# Check for non-numeric data and handle it
if state.dtype == object:
# Convert object array to float array
sanitized = np.zeros_like(state, dtype=np.float32)
for i in range(state.shape[0]):
if len(state.shape) > 1:
for j in range(state.shape[1]):
sanitized[i, j] = self._extract_numeric_value(state[i, j])
else:
sanitized[i] = self._extract_numeric_value(state[i])
return sanitized
else:
return state.astype(np.float32)
# If state is a list or tuple, convert to array
elif isinstance(state, (list, tuple)):
# Recursively sanitize each element
sanitized = []
for item in state:
if isinstance(item, (list, tuple)):
sanitized_row = []
for sub_item in item:
sanitized_row.append(self._extract_numeric_value(sub_item))
sanitized.append(sanitized_row)
else:
sanitized.append(self._extract_numeric_value(item))
return np.array(sanitized, dtype=np.float32)
# If state is a dict, try to extract values
elif isinstance(state, dict):
# Try to extract meaningful values from dict
values = []
for key in sorted(state.keys()): # Sort for consistency
values.append(self._extract_numeric_value(state[key]))
return np.array(values, dtype=np.float32)
# If state is a single value, make it an array
else:
return np.array([self._extract_numeric_value(state)], dtype=np.float32)
except Exception as e:
logger.warning(f"Error sanitizing state data: {e}. Using zero array with expected dimensions.")
# Return a zero array as fallback with the expected state dimension
# Use the state_dim from initialization, fallback to 403 if not available
expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
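# Illustrative example (not from the commit): a dict state is flattened into a
# float32 vector ordered by sorted keys, with non-numeric entries mapped to 0.0:
#   self._sanitize_state_data({'price': 100.5, 'rsi': None, 'volume': 2.0})
#   -> array([100.5, 0.0, 2.0], dtype=float32)   # keys sorted: price, rsi, volume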
def _extract_numeric_value(self, value):
"""Extract a numeric value from various data types"""
try:
# Handle None values
if value is None:
return 0.0
# Handle numeric types
if isinstance(value, (int, float, np.number)):
return float(value)
# Handle dict values
elif isinstance(value, dict):
# Try common keys for numeric data
for key in ['value', 'price', 'close', 'last', 'amount', 'quantity']:
if key in value:
return self._extract_numeric_value(value[key])
# If no common keys, try to get first numeric value
for v in value.values():
if isinstance(v, (int, float, np.number)):
return float(v)
return 0.0
# Handle string values that might be numeric
elif isinstance(value, str):
try:
return float(value)
except ValueError:
return 0.0
# Handle datetime objects
elif hasattr(value, 'timestamp'):
return float(value.timestamp())
# Handle boolean values (note: bool is an int subclass, so this branch is normally caught by the numeric check above)
elif isinstance(value, bool):
return float(value)
# Handle list/tuple - take first numeric value
elif isinstance(value, (list, tuple)) and len(value) > 0:
return self._extract_numeric_value(value[0])
else:
return 0.0
except Exception:
return 0.0
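Taken together, the fallback chain in _extract_numeric_value maps heterogeneous inputs to plain floats. A hedged illustration based on the branches above (not output captured from a run; agent is an assumed DQNAgent instance):

agent._extract_numeric_value(3)                    # -> 3.0   (numeric)
agent._extract_numeric_value({'close': '101.2'})   # -> 101.2 (dict -> 'close' -> numeric string)
agent._extract_numeric_value('not-a-number')       # -> 0.0   (string that fails float())
agent._extract_numeric_value(None)                 # -> 0.0   (None fallback)
agent._extract_numeric_value([7, 8, 9])            # -> 7.0   (first element of a list/tuple)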