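"""Deep Q-Network (DQN) agent for trading.

Wraps the CNNModelPyTorch network with an epsilon-greedy policy, prioritized
experience replay buffers, a target network, and checkpoint save/load helpers.
"""
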
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
from typing import Tuple, List
import os
import sys
import logging

# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from NN.models.simple_cnn import CNNModelPyTorch

# Configure logger
logger = logging.getLogger(__name__)

class DQNAgent:
    """
    Deep Q-Network agent for trading
    Uses CNN model as the base network
    """
    def __init__(self,
                 state_size: int,
                 action_size: int,
                 window_size: int,
                 num_features: int,
                 timeframes: List[str],
                 learning_rate: float = 0.0005,  # Reduced learning rate for more stability
                 gamma: float = 0.97,            # Slightly reduced discount factor
                 epsilon: float = 1.0,
                 epsilon_min: float = 0.05,      # Increased minimum epsilon for more exploration
                 epsilon_decay: float = 0.9975,  # Slower decay rate
                 memory_size: int = 20000,       # Increased memory size
                 batch_size: int = 128,          # Larger batch size
                 target_update: int = 5):        # More frequent target updates

        self.state_size = state_size
        self.action_size = action_size
        self.window_size = window_size
        self.num_features = num_features
        self.timeframes = timeframes
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.target_update = target_update

        # Device configuration
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Initialize networks
        self.policy_net = CNNModelPyTorch(
            window_size=window_size,
            num_features=num_features,
            output_size=action_size,
            timeframes=timeframes
        ).to(self.device)

        self.target_net = CNNModelPyTorch(
            window_size=window_size,
            num_features=num_features,
            output_size=action_size,
            timeframes=timeframes
        ).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        # Initialize optimizer (weight decay for regularization; gradients are clipped during training)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)

        # Initialize memories with different priorities
        self.memory = deque(maxlen=memory_size)
        self.extrema_memory = deque(maxlen=memory_size // 4)   # For extrema points
        self.positive_memory = deque(maxlen=memory_size // 4)  # For positive rewards

        # Training metrics
        self.update_count = 0
        self.losses = []
        self.avg_reward = 0
        self.no_improvement_count = 0
        self.best_reward = float('-inf')

    def remember(self, state: np.ndarray, action: int, reward: float,
                 next_state: np.ndarray, done: bool, is_extrema: bool = False):
        """
        Store experience in memory with prioritization

        Args:
            state: Current state
            action: Action taken
            reward: Reward received
            next_state: Next state
            done: Whether episode is done
            is_extrema: Whether this is a local extrema sample (for specialized learning)
        """
        experience = (state, action, reward, next_state, done)

        # Always add to main memory
        self.memory.append(experience)

        # Add to specialized memories if applicable
        if is_extrema:
            self.extrema_memory.append(experience)

        # Store positive experiences separately for prioritized replay
        if reward > 0:
            self.positive_memory.append(experience)

    def act(self, state: np.ndarray, explore=True) -> int:
        """Choose action using epsilon-greedy policy with explore flag"""
        if explore and random.random() < self.epsilon:
            return random.randrange(self.action_size)

        with torch.no_grad():
            # Ensure state is normalized before inference
            state_tensor = self._normalize_state(state)
            state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device)
            action_probs, extrema_pred = self.policy_net(state_tensor)
            return action_probs.argmax().item()

    def _normalize_state(self, state: np.ndarray) -> np.ndarray:
        """Normalize the state data to prevent numerical issues"""
        # Handle NaN and infinite values
        state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)

        # Check if state is 1D array (happens in some environments)
        if len(state.shape) == 1:
            # If 1D, we need to normalize the whole array
            normalized_state = state.copy()

            # Convert any timestamp or non-numeric data to float
            for i in range(len(normalized_state)):
                # Check for timestamp-like objects
                if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')):
                    # Convert timestamp to float (seconds since epoch)
                    normalized_state[i] = float(normalized_state[i].timestamp())
                elif not isinstance(normalized_state[i], (int, float, np.number)):
                    # Set non-numeric data to 0
                    normalized_state[i] = 0.0

            # Ensure all values are float
            normalized_state = normalized_state.astype(np.float32)

            # Simple min-max normalization for 1D state
            state_min = np.min(normalized_state)
            state_max = np.max(normalized_state)
            if state_max > state_min:
                normalized_state = (normalized_state - state_min) / (state_max - state_min)
            return normalized_state

        # Handle 2D arrays
        normalized_state = np.zeros_like(state, dtype=np.float32)

        # Convert any timestamp or non-numeric data to float
        for i in range(state.shape[0]):
            for j in range(state.shape[1]):
                if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')):
                    # Convert timestamp to float (seconds since epoch)
                    normalized_state[i, j] = float(state[i, j].timestamp())
                elif isinstance(state[i, j], (int, float, np.number)):
                    normalized_state[i, j] = state[i, j]
                else:
                    # Set non-numeric data to 0
                    normalized_state[i, j] = 0.0

        # Loop through each timeframe's features in the combined state
        feature_count = state.shape[1] // len(self.timeframes)

        for tf_idx in range(len(self.timeframes)):
            start_idx = tf_idx * feature_count
            end_idx = start_idx + feature_count

            # Extract this timeframe's features
            tf_features = normalized_state[:, start_idx:end_idx]

            # Normalize OHLC data by the average close price in the window
            # This makes price movements relative rather than absolute
            price_idx = 3  # Assuming close price is at index 3
            if price_idx < tf_features.shape[1]:
                reference_price = np.mean(tf_features[:, price_idx])
                if reference_price != 0:
                    # Normalize price-related columns (OHLC)
                    for i in range(4):  # First 4 columns are OHLC
                        if i < tf_features.shape[1]:
                            normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price

            # Normalize volume using mean and std
            vol_idx = 4  # Assuming volume is at index 4
            if vol_idx < tf_features.shape[1]:
                vol_mean = np.mean(tf_features[:, vol_idx])
                vol_std = np.std(tf_features[:, vol_idx])
                if vol_std > 0:
                    normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std
                else:
                    normalized_state[:, start_idx + vol_idx] = 0

            # Other features (technical indicators) - normalize with min-max scaling
            for i in range(5, feature_count):
                if i < tf_features.shape[1]:
                    feature_min = np.min(tf_features[:, i])
                    feature_max = np.max(tf_features[:, i])
                    if feature_max > feature_min:
                        normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min)
                    else:
                        normalized_state[:, start_idx + i] = 0

        return normalized_state

    def replay(self, use_prioritized=True) -> float:
        """
        Train on a batch of experiences with prioritized sampling

        Args:
            use_prioritized: Whether to use prioritized replay

        Returns:
            float: Loss value
        """
        if len(self.memory) < self.batch_size:
            return 0.0

        # Sample batch with prioritization
        batch = []

        if use_prioritized and len(self.positive_memory) > 0 and len(self.extrema_memory) > 0:
            # Prioritized sampling from different memory types
            positive_count = min(self.batch_size // 4, len(self.positive_memory))
            extrema_count = min(self.batch_size // 4, len(self.extrema_memory))
            regular_count = self.batch_size - positive_count - extrema_count

            positive_samples = random.sample(list(self.positive_memory), positive_count)
            extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
            regular_samples = random.sample(list(self.memory), regular_count)

            batch = positive_samples + extrema_samples + regular_samples
        else:
            # Standard sampling
            batch = random.sample(self.memory, self.batch_size)

        states, actions, rewards, next_states, dones = zip(*batch)

        # Normalize states before training
        normalized_states = np.array([self._normalize_state(state) for state in states])
        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])

        # Convert to tensors and move to device
        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)

        # Get current Q values
        current_q_values, extrema_pred = self.policy_net(states_tensor)
        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))

        # Get next Q values from target network (Double DQN approach)
        with torch.no_grad():
            # Get actions from policy network
            next_actions, _ = self.policy_net(next_states_tensor)
            next_actions = next_actions.max(1)[1].unsqueeze(1)

            # Get Q values from target network for those actions
            next_q_values, _ = self.target_net(next_states_tensor)
            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)

            # Compute target Q values
            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values

            # Clamp target values to prevent extreme values
            target_q_values = torch.clamp(target_q_values, -100, 100)

        # Compute Huber loss (more robust to outliers than MSE)
        loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()

        # Apply gradient clipping to prevent exploding gradients
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)

        self.optimizer.step()

        # Track loss for monitoring (this list is saved in agent checkpoints)
        self.losses.append(loss.item())

        # Update target network if needed
        self.update_count += 1
        if self.update_count % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

        # Decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return loss.item()

    def train_on_extrema(self, states, actions, rewards, next_states, dones):
        """
        Special training method focused on extrema patterns

        Args:
            states: Array of states near extrema points
            actions: Correct actions to take (buy at bottoms, sell at tops)
            rewards: Rewards for each action
            next_states: Next states
            dones: Done flags
        """
        if len(states) == 0:
            return 0.0

        # Normalize states
        normalized_states = np.array([self._normalize_state(state) for state in states])
        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])

        # Convert to tensors
        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)

        # Forward pass
        current_q_values, extrema_pred = self.policy_net(states_tensor)
        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))

        # Get next Q values (Double DQN approach)
        with torch.no_grad():
            next_actions, _ = self.policy_net(next_states_tensor)
            next_actions = next_actions.max(1)[1].unsqueeze(1)

            next_q_values, _ = self.target_net(next_states_tensor)
            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)

            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values

            # Clamp target values
            target_q_values = torch.clamp(target_q_values, -100, 100)

        # Use Huber loss for extrema training
        q_loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)

        # Full loss
        loss = q_loss

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()

        return loss.item()

    def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
        """Update learning metrics and perform learning rate adjustments if needed"""
        # Update average reward with exponential moving average
        if self.avg_reward == 0:
            self.avg_reward = episode_reward
        else:
            self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward

        # Check if we're making sufficient progress
        if episode_reward > (1 + best_reward_threshold) * self.best_reward:
            self.best_reward = episode_reward
            self.no_improvement_count = 0
            return True  # Improved
        else:
            self.no_improvement_count += 1

            # If no improvement for a while, adjust learning rate
            if self.no_improvement_count >= 10:
                current_lr = self.optimizer.param_groups[0]['lr']
                new_lr = current_lr * 0.5
                if new_lr >= 1e-6:  # Don't reduce below minimum threshold
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = new_lr
                    logger.info(f"Reducing learning rate from {current_lr} to {new_lr}")
                self.no_improvement_count = 0

            return False  # No improvement

    def save(self, path: str):
        """Save model and agent state"""
        os.makedirs(os.path.dirname(path), exist_ok=True)

        # Save policy network
        self.policy_net.save(f"{path}_policy")

        # Save target network
        self.target_net.save(f"{path}_target")

        # Save agent state
        state = {
            'epsilon': self.epsilon,
            'update_count': self.update_count,
            'losses': self.losses,
            'optimizer_state': self.optimizer.state_dict(),
            'best_reward': self.best_reward,
            'avg_reward': self.avg_reward
        }

        torch.save(state, f"{path}_agent_state.pt")
        logger.info(f"Agent state saved to {path}_agent_state.pt")

    def load(self, path: str):
        """Load model and agent state"""
        # Load policy network
        self.policy_net.load(f"{path}_policy")

        # Load target network
        self.target_net.load(f"{path}_target")

        # Load agent state
        try:
            agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device)
            self.epsilon = agent_state['epsilon']
            self.update_count = agent_state['update_count']
            self.losses = agent_state['losses']
            self.optimizer.load_state_dict(agent_state['optimizer_state'])

            # Load additional metrics if they exist
            if 'best_reward' in agent_state:
                self.best_reward = agent_state['best_reward']
            if 'avg_reward' in agent_state:
                self.avg_reward = agent_state['avg_reward']

            logger.info(f"Agent state loaded from {path}_agent_state.pt")
        except FileNotFoundError:
            logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")
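

# --- Illustrative usage sketch (not part of the original agent code) ---
# A minimal smoke test showing how the public API fits together:
# act -> remember -> replay. The action labels, the assumed state shape of
# (window_size, num_features * len(timeframes)), and the random stand-in
# "environment" below are assumptions for illustration only, not the
# project's real training loop or data layout.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    window_size, num_features, timeframes = 20, 5, ["1m", "5m"]
    agent = DQNAgent(
        state_size=window_size * num_features * len(timeframes),
        action_size=3,  # e.g. 0=hold, 1=buy, 2=sell (assumed labels)
        window_size=window_size,
        num_features=num_features,
        timeframes=timeframes,
    )

    def random_step():
        """Stand-in for a real trading environment: random states and rewards."""
        shape = (window_size, num_features * len(timeframes))
        return (np.random.randn(*shape).astype(np.float32),
                np.random.randn(*shape).astype(np.float32),
                float(np.random.randn()),
                False)

    for step in range(256):
        state, next_state, reward, done = random_step()
        action = agent.act(state)
        agent.remember(state, action, reward, next_state, done)
        loss = agent.replay()  # returns 0.0 until the buffer holds a full batch
        if loss:
            logger.info(f"step={step} loss={loss:.4f} epsilon={agent.epsilon:.3f}")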