# gogo2/NN/train_enhanced.py
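"""
Training script for the enhanced DQN trading agent.

Optionally pre-trains the agent's price-prediction heads on labeled candle data,
then trains the agent with reinforcement learning in a simulated trading environment.
"""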

import os
import sys
import time
import logging
import argparse
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import contextlib
from sklearn.model_selection import train_test_split
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Import our enhanced agent
from NN.models.dqn_agent_enhanced import EnhancedDQNAgent
from NN.utils.data_interface import DataInterface
# Configure logging (ensure the log directory exists before the file handler is created)
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('logs/enhanced_training.log')
    ]
)
logger = logging.getLogger(__name__)


def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Train enhanced RL trading agent')
    parser.add_argument('--episodes', type=int, default=100, help='Number of episodes to train')
    parser.add_argument('--max-steps', type=int, default=2000, help='Maximum steps per episode')
    parser.add_argument('--symbol', type=str, default='ETH/USDT', help='Trading symbol')
    parser.add_argument('--no-gpu', action='store_true', help='Disable GPU usage')
    parser.add_argument('--confidence', type=float, default=0.4, help='Confidence threshold')
    parser.add_argument('--load-model', type=str, default='', help='Load existing model')
    parser.add_argument('--batch-size', type=int, default=128, help='Training batch size')
    parser.add_argument('--learning-rate', type=float, default=0.0003, help='Learning rate')
    parser.add_argument('--no-pretrain', action='store_true', help='Skip pre-training')
    parser.add_argument('--pretrain-epochs', type=int, default=20, help='Number of pre-training epochs')
    return parser.parse_args()
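
# Example invocation (values and working directory are illustrative; run from the
# project root so the NN package resolves):
#   python NN/train_enhanced.py --episodes 200 --symbol ETH/USDT --batch-size 128 --learning-rate 0.0003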


def generate_price_prediction_training_data(data_1m, data_1h, data_1d, window_size=20):
    """
    Generate labeled training data for price prediction pre-training

    Args:
        data_1m: 1-minute candle data
        data_1h: 1-hour candle data
        data_1d: 1-day candle data
        window_size: Size of the observation window

    Returns:
        X, y_immediate, y_midterm, y_longterm, y_values
    """
    logger.info("Generating price prediction training data")

    # Features to use
    ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']

    # Create feature sets
    X = []
    y_immediate = []  # 1m prediction (next 5 min)
    y_midterm = []    # 1h prediction (next few hours)
    y_longterm = []   # 1d prediction (next day)
    y_values = []     # % change for each timeframe

    # Need enough data for all timeframes
    if len(data_1m) < window_size + 5 or len(data_1h) < 2 or len(data_1d) < 2:
        logger.error("Not enough data for all timeframes")
        return np.array([]), np.array([]), np.array([]), np.array([]), np.array([])

    # Generate examples
    for i in range(window_size, len(data_1m) - 5):
        # Skip if we can't align with higher timeframes
        if i % 60 != 0:  # Only use minutes that align with hour boundaries
            continue
        try:
            # Get window of 1m data as input (positional row slice)
            window_1m = data_1m.iloc[i-window_size:i][ohlcv_columns].values

            # Find corresponding indices in higher timeframes
            curr_timestamp = data_1m.index[i]
            h_idx = data_1h.index.get_indexer([curr_timestamp], method='nearest')[0]
            d_idx = data_1d.index.get_indexer([curr_timestamp], method='nearest')[0]

            # Skip if indices are out of bounds
            if h_idx < 0 or h_idx >= len(data_1h) - 1 or d_idx < 0 or d_idx >= len(data_1d) - 1:
                continue

            # Get future prices for label generation (iloc for positional row access)
            future_5m = data_1m.iloc[i+5]['close']
            future_1h = data_1h.iloc[h_idx+1]['close']
            future_1d = data_1d.iloc[d_idx+1]['close']
            current_price = data_1m.iloc[i]['close']

            # Calculate % change for each timeframe
            change_5m = (future_5m - current_price) / current_price * 100
            change_1h = (future_1h - current_price) / current_price * 100
            change_1d = (future_1d - current_price) / current_price * 100

            # Determine price direction (0=down, 1=sideways, 2=up)
            def get_direction(change):
                if change < -0.5:    # Down if less than -0.5%
                    return 0
                elif change > 0.5:   # Up if more than 0.5%
                    return 2
                else:                # Sideways if between -0.5% and 0.5%
                    return 1

            direction_5m = get_direction(change_5m)
            direction_1h = get_direction(change_1h)
            direction_1d = get_direction(change_1d)

            # Add to dataset
            X.append(window_1m.flatten())
            y_immediate.append(direction_5m)
            y_midterm.append(direction_1h)
            y_longterm.append(direction_1d)
            y_values.append([change_5m, change_1h, change_1d, 0])  # Last value reserved
        except Exception as e:
            logger.warning(f"Error generating training example at index {i}: {str(e)}")

    # Convert to numpy arrays
    X = np.array(X)
    y_immediate = np.array(y_immediate)
    y_midterm = np.array(y_midterm)
    y_longterm = np.array(y_longterm)
    y_values = np.array(y_values)

    logger.info(f"Generated {len(X)} training examples")
    if len(X) > 0:
        logger.info(f"Class distribution - Immediate: {np.bincount(y_immediate)}, "
                    f"Midterm: {np.bincount(y_midterm)}, Long-term: {np.bincount(y_longterm)}")
    return X, y_immediate, y_midterm, y_longterm, y_values
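
# Labeling sketch for get_direction above: a +0.8% change maps to class 2 (up),
# a -0.2% change to class 1 (sideways), and a -1.3% change to class 0 (down).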


def pretrain_price_prediction(agent, data_interface, n_epochs=20, batch_size=128, device=None):
    """
    Pre-train the price prediction capabilities of the agent

    Args:
        agent: EnhancedDQNAgent instance
        data_interface: DataInterface instance
        n_epochs: Number of pre-training epochs
        batch_size: Batch size for pre-training
        device: Device to use for pre-training

    Returns:
        The pre-trained agent
    """
    logger.info("Starting price prediction pre-training")
    try:
        # Ensure we have the necessary timeframes
        timeframes_needed = ['1m', '1h', '1d']
        for tf in timeframes_needed:
            if tf not in data_interface.timeframes:
                logger.info(f"Adding timeframe {tf} for pre-training")
                data_interface.timeframes.append(tf)
                data_interface.dataframes[tf] = None

        # Get data for each timeframe
        data_1m = data_interface.get_historical_data(timeframe='1m')
        data_1h = data_interface.get_historical_data(timeframe='1h')
        data_1d = data_interface.get_historical_data(timeframe='1d')

        # Generate labeled training data
        X, y_immediate, y_midterm, y_longterm, y_values = generate_price_prediction_training_data(
            data_1m, data_1h, data_1d, window_size=20
        )
        if len(X) == 0:
            logger.error("No training examples generated. Skipping pre-training.")
            return agent

        # Split data into training and validation sets
        X_train, X_val, y_imm_train, y_imm_val, y_mid_train, y_mid_val, y_long_train, y_long_val, y_val_train, y_val_val = train_test_split(
            X, y_immediate, y_midterm, y_longterm, y_values, test_size=0.2, random_state=42
        )

        # Convert to torch tensors
        X_train_tensor = torch.FloatTensor(X_train).to(device)
        y_imm_train_tensor = torch.LongTensor(y_imm_train).to(device)
        y_mid_train_tensor = torch.LongTensor(y_mid_train).to(device)
        y_long_train_tensor = torch.LongTensor(y_long_train).to(device)
        y_val_train_tensor = torch.FloatTensor(y_val_train).to(device)
        X_val_tensor = torch.FloatTensor(X_val).to(device)
        y_imm_val_tensor = torch.LongTensor(y_imm_val).to(device)
        y_mid_val_tensor = torch.LongTensor(y_mid_val).to(device)
        y_long_val_tensor = torch.LongTensor(y_long_val).to(device)
        y_val_val_tensor = torch.FloatTensor(y_val_val).to(device)

        # Calculate class weights for imbalanced data
        def get_class_weights(labels):
            counts = np.bincount(labels)
            if len(counts) < 3:  # Ensure we have 3 classes
                counts = np.append(counts, [0] * (3 - len(counts)))
            # Guard against division by zero for classes that never occur
            weights = 1.0 / np.maximum(counts, 1)
            weights = weights / np.sum(weights)  # Normalize
            return weights
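
        # Worked example: class counts [100, 300, 600] yield normalized weights of
        # roughly [0.667, 0.222, 0.111], so the rarest class gets the largest weight.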
        imm_weights = torch.FloatTensor(get_class_weights(y_imm_train)).to(device)
        mid_weights = torch.FloatTensor(get_class_weights(y_mid_train)).to(device)
        long_weights = torch.FloatTensor(get_class_weights(y_long_train)).to(device)

        # Create DataLoader for batch training
        train_dataset = TensorDataset(
            X_train_tensor, y_imm_train_tensor, y_mid_train_tensor,
            y_long_train_tensor, y_val_train_tensor
        )
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Set up loss functions with class weights
        imm_criterion = nn.CrossEntropyLoss(weight=imm_weights)
        mid_criterion = nn.CrossEntropyLoss(weight=mid_weights)
        long_criterion = nn.CrossEntropyLoss(weight=long_weights)
        value_criterion = nn.MSELoss()

        # Set up optimizer (separate from agent's optimizer)
        pretrain_optimizer = torch.optim.Adam(agent.policy_net.parameters(), lr=0.0002)
        pretrain_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            pretrain_optimizer, mode='min', factor=0.5, patience=3, verbose=True
        )

        # Set model to training mode
        agent.policy_net.train()

        # Training loop
        best_val_loss = float('inf')
        patience = 5
        patience_counter = 0

        # Create TensorBoard writer for pre-training
        writer = SummaryWriter(log_dir=f'runs/pretrain_{int(time.time())}')

        for epoch in range(n_epochs):
            # Training phase
            train_loss = 0.0
            imm_correct, mid_correct, long_correct = 0, 0, 0
            total = 0
            for X_batch, y_imm_batch, y_mid_batch, y_long_batch, y_val_batch in train_loader:
                # Zero gradients
                pretrain_optimizer.zero_grad()

                # Forward pass
                with torch.cuda.amp.autocast() if agent.use_mixed_precision else contextlib.nullcontext():
                    q_values, _, price_preds, _ = agent.policy_net(X_batch)

                    # Calculate losses for each prediction head
                    imm_loss = imm_criterion(price_preds['immediate'], y_imm_batch)
                    mid_loss = mid_criterion(price_preds['midterm'], y_mid_batch)
                    long_loss = long_criterion(price_preds['longterm'], y_long_batch)
                    value_loss = value_criterion(price_preds['values'], y_val_batch)

                    # Combined loss (weighted by importance)
                    total_loss = imm_loss + 0.7 * mid_loss + 0.5 * long_loss + 0.3 * value_loss

                # Backward pass and optimize
                if agent.use_mixed_precision:
                    agent.scaler.scale(total_loss).backward()
                    agent.scaler.unscale_(pretrain_optimizer)
                    torch.nn.utils.clip_grad_norm_(agent.policy_net.parameters(), 1.0)
                    agent.scaler.step(pretrain_optimizer)
                    agent.scaler.update()
                else:
                    total_loss.backward()
                    torch.nn.utils.clip_grad_norm_(agent.policy_net.parameters(), 1.0)
                    pretrain_optimizer.step()

                # Accumulate metrics
                train_loss += total_loss.item()
                total += X_batch.size(0)

                # Calculate accuracy
                _, imm_pred = torch.max(price_preds['immediate'], 1)
                _, mid_pred = torch.max(price_preds['midterm'], 1)
                _, long_pred = torch.max(price_preds['longterm'], 1)
                imm_correct += (imm_pred == y_imm_batch).sum().item()
                mid_correct += (mid_pred == y_mid_batch).sum().item()
                long_correct += (long_pred == y_long_batch).sum().item()

            # Calculate epoch metrics
            train_loss /= len(train_loader)
            imm_acc = imm_correct / total
            mid_acc = mid_correct / total
            long_acc = long_correct / total

            # Validation phase
            agent.policy_net.eval()
            val_loss = 0.0
            imm_val_correct, mid_val_correct, long_val_correct = 0, 0, 0
            with torch.no_grad():
                # Forward pass on validation data
                q_values, _, val_price_preds, _ = agent.policy_net(X_val_tensor)

                # Calculate validation losses
                val_imm_loss = imm_criterion(val_price_preds['immediate'], y_imm_val_tensor)
                val_mid_loss = mid_criterion(val_price_preds['midterm'], y_mid_val_tensor)
                val_long_loss = long_criterion(val_price_preds['longterm'], y_long_val_tensor)
                val_value_loss = value_criterion(val_price_preds['values'], y_val_val_tensor)
                val_total_loss = val_imm_loss + 0.7 * val_mid_loss + 0.5 * val_long_loss + 0.3 * val_value_loss
                val_loss = val_total_loss.item()

                # Calculate validation accuracy
                _, imm_val_pred = torch.max(val_price_preds['immediate'], 1)
                _, mid_val_pred = torch.max(val_price_preds['midterm'], 1)
                _, long_val_pred = torch.max(val_price_preds['longterm'], 1)
                imm_val_correct = (imm_val_pred == y_imm_val_tensor).sum().item()
                mid_val_correct = (mid_val_pred == y_mid_val_tensor).sum().item()
                long_val_correct = (long_val_pred == y_long_val_tensor).sum().item()
                imm_val_acc = imm_val_correct / len(X_val_tensor)
                mid_val_acc = mid_val_correct / len(X_val_tensor)
                long_val_acc = long_val_correct / len(X_val_tensor)

            # Log to TensorBoard
            writer.add_scalar('pretrain/train_loss', train_loss, epoch)
            writer.add_scalar('pretrain/val_loss', val_loss, epoch)
            writer.add_scalar('pretrain/imm_acc', imm_acc, epoch)
            writer.add_scalar('pretrain/mid_acc', mid_acc, epoch)
            writer.add_scalar('pretrain/long_acc', long_acc, epoch)
            writer.add_scalar('pretrain/imm_val_acc', imm_val_acc, epoch)
            writer.add_scalar('pretrain/mid_val_acc', mid_val_acc, epoch)
            writer.add_scalar('pretrain/long_val_acc', long_val_acc, epoch)

            # Learning rate scheduling
            pretrain_scheduler.step(val_loss)

            # Early stopping check
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                # Copy policy_net weights to target_net
                agent.target_net.load_state_dict(agent.policy_net.state_dict())
                logger.info(f"Saved best model with validation loss: {val_loss:.4f}")
                # Save pre-trained model
                agent.save("NN/models/saved/enhanced_dqn_pretrained")
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    logger.info(f"Early stopping triggered after {epoch+1} epochs")
                    break

            # Log progress
            logger.info(f"Epoch {epoch+1}/{n_epochs}: "
                        f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
                        f"Imm Acc: {imm_acc:.4f}/{imm_val_acc:.4f}, "
                        f"Mid Acc: {mid_acc:.4f}/{mid_val_acc:.4f}, "
                        f"Long Acc: {long_acc:.4f}/{long_val_acc:.4f}")

            # Set model back to training mode for next epoch
            agent.policy_net.train()

        writer.close()
        logger.info("Price prediction pre-training complete")
        return agent
    except Exception as e:
        logger.error(f"Error during price prediction pre-training: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        return agent


def train_enhanced_rl(args):
    """
    Train the enhanced RL agent for trading

    Args:
        args: Command line arguments
    """
    # Setup device
    if args.no_gpu:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {device}")

    # Set up data interface
    data_interface = DataInterface(symbol=args.symbol, timeframes=['1m', '5m', '15m'])

    # Fetch historical data for each timeframe
    for timeframe in data_interface.timeframes:
        df = data_interface.get_historical_data(timeframe=timeframe)
        logger.info(f"Using data for {args.symbol} {timeframe} ({len(df)} candles)")

    # Create environment for training
    from NN.environments.trading_env import TradingEnvironment
    window_size = 20
    train_env = TradingEnvironment(
        data_interface=data_interface,
        initial_balance=10000.0,
        transaction_fee=0.0002,
        window_size=window_size,
        max_position=1.0,
        reward_scaling=100.0
    )
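
    # TradingEnvironment is assumed to expose the gym-style API used below:
    # reset() -> initial state, step(action) -> (next_state, reward, done, info).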

    # Create agent with improved parameters
    state_shape = train_env.observation_space.shape
    n_actions = train_env.action_space.n
    agent = EnhancedDQNAgent(
        state_shape=state_shape,
        n_actions=n_actions,
        learning_rate=args.learning_rate,
        gamma=0.95,
        epsilon=1.0,
        epsilon_min=0.05,
        epsilon_decay=0.995,
        buffer_size=50000,
        batch_size=args.batch_size,
        target_update=10,
        confidence_threshold=args.confidence,
        device=device
    )
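
    # With epsilon_decay=0.995, exploration drops from 1.0 to about 0.1 after roughly
    # 460 decay updates (0.995^460 ~= 0.1); whether decay is applied per step or per
    # episode depends on EnhancedDQNAgent's implementation.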

    # Load existing model if specified
    if args.load_model:
        model_path = args.load_model
        if agent.load(model_path):
            logger.info(f"Loaded existing model from {model_path}")
        else:
            logger.error(f"Error loading model from {model_path}")

    # Pre-training for price prediction
    if not args.no_pretrain and not args.load_model:
        logger.info("Starting pre-training phase")
        agent = pretrain_price_prediction(
            agent=agent,
            data_interface=data_interface,
            n_epochs=args.pretrain_epochs,
            batch_size=args.batch_size,
            device=device
        )
        logger.info("Pre-training completed")

    # Setup TensorBoard
    writer = SummaryWriter(log_dir=f'runs/enhanced_rl_{int(time.time())}')

    # Log hardware info
    writer.add_text("hardware/device", str(device), 0)
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            writer.add_text(f"hardware/gpu_{i}", torch.cuda.get_device_name(i), 0)

    # Move agent to device
    agent.move_models_to_device(device)

    # Training loop
    logger.info(f"Starting enhanced training for {args.episodes} episodes")
    total_rewards = []
    episode_losses = []
    trade_win_rates = []
    best_reward = -np.inf
    try:
        for episode in range(args.episodes):
            # Reset environment for new episode
            state = train_env.reset()
            total_reward = 0.0
            done = False
            step = 0
            episode_start_time = time.time()

            # Track trade statistics
            trades = []
            wins = 0
            losses = 0

            # Run episode
            while not done and step < args.max_steps:
                # Choose action
                action, confidence = agent.act(state)

                # Take action in environment
                next_state, reward, done, info = train_env.step(action)

                # Remember experience
                agent.remember(state, action, reward, next_state, done)

                # Track trade results
                if 'trade_result' in info and info['trade_result'] is not None:
                    trade_result = info['trade_result']
                    trade_pnl = trade_result['pnl']
                    trades.append(trade_pnl)
                    if trade_pnl > 0:
                        wins += 1
                        logger.info(f"Profitable trade! {trade_pnl:.2f}% profit, reward: {reward:.4f}")
                    else:
                        losses += 1
                        logger.info(f"Losing trade! {trade_pnl:.2f}% loss, penalty: {reward:.4f}")

                # Update state and counters
                state = next_state
                total_reward += reward
                step += 1

                # Train agent
                loss = agent.replay()
                if loss is not None and loss > 0:
                    episode_losses.append(loss)

            # Log training metrics for each episode
            episode_time = time.time() - episode_start_time
            total_rewards.append(total_reward)

            # Calculate win rate
            win_rate = wins / max(1, (wins + losses))
            trade_win_rates.append(win_rate)

            # Log to console and TensorBoard
            logger.info(f"Episode {episode}/{args.episodes} - Reward: {total_reward:.4f}, Win Rate: {win_rate:.2f}, "
                        f"Trades: {len(trades)}, Balance: ${train_env.balance:.2f}, Epsilon: {agent.epsilon:.4f}, "
                        f"Time: {episode_time:.2f}s")
            writer.add_scalar('metrics/reward', total_reward, episode)
            writer.add_scalar('metrics/balance', train_env.balance, episode)
            writer.add_scalar('metrics/win_rate', win_rate, episode)
            writer.add_scalar('metrics/trades', len(trades), episode)
            writer.add_scalar('metrics/epsilon', agent.epsilon, episode)
            if episode_losses:
                avg_loss = sum(episode_losses) / len(episode_losses)
                writer.add_scalar('metrics/loss', avg_loss, episode)

            # Check if this is the best model so far
            if total_reward > best_reward:
                best_reward = total_reward
                # Save best model
                agent.save("NN/models/saved/enhanced_dqn_best")
                logger.info(f"New best model saved with reward: {best_reward:.4f}")

            # Save checkpoint every 10 episodes
            if episode % 10 == 0 and episode > 0:
                agent.save("NN/models/saved/enhanced_dqn_checkpoint")
                logger.info(f"Checkpoint saved at episode {episode}")

            # Reset episode losses for the next episode
            episode_losses = []

        # Final save
        agent.save("NN/models/saved/enhanced_dqn_final")
        logger.info("Enhanced training completed, final model saved")
    except KeyboardInterrupt:
        logger.info("Training interrupted by user")
    except Exception as e:
        logger.error(f"Training failed: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
    finally:
        # Close TensorBoard writer
        writer.close()
    return agent, train_env


if __name__ == "__main__":
    # Create output directories if they don't exist
    os.makedirs("logs", exist_ok=True)
    os.makedirs("NN/models/saved", exist_ok=True)

    # Parse arguments
    args = parse_args()

    # Start training
    train_enhanced_rl(args)