gogo2/NN/train_rl.py
Dobromir Popov 4eac14022c RL training
2025-03-31 03:31:54 +03:00

import torch
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import logging
import time
from datetime import datetime
import os
import sys
import pandas as pd
import gym
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from NN.utils.data_interface import DataInterface
from NN.utils.trading_env import TradingEnvironment
from NN.models.dqn_agent import DQNAgent
from NN.utils.signal_interpreter import SignalInterpreter

# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('rl_training.log'),
        logging.StreamHandler()
    ]
)


class RLTradingEnvironment(TradingEnvironment):
    """Extended trading environment that reshapes the flat state for the CNN"""

    def __init__(self, data, window_size, num_features, num_timeframes, **kwargs):
        # Set attributes before parent initialization
        self.window_size = window_size
        self.num_features = num_features
        self.num_timeframes = num_timeframes
        self.feature_dim = num_features * num_timeframes

        # Initialize parent class
        super().__init__(data=data, **kwargs)

        # Update observation space for the CNN input shape
        self.observation_space = gym.spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size, self.feature_dim),
            dtype=np.float32
        )

    def _get_observation(self):
        """Get the current observation reshaped for the CNN"""
        # Get flattened observation from the parent class
        flat_obs = super()._get_observation()

        # Extract features (exclude the trailing close price)
        features = flat_obs[:-1]

        # Number of complete feature windows available
        n_windows = len(features) // self.feature_dim

        if n_windows < self.window_size:
            # Pad with zeros at the front if there is not enough history yet
            padding = np.zeros((self.window_size - n_windows, self.feature_dim))
            recent = features[len(features) - n_windows * self.feature_dim:]
            reshaped = np.vstack([padding, recent.reshape(n_windows, self.feature_dim)])
        else:
            # Take the most recent window_size complete windows
            end_idx = n_windows * self.feature_dim
            start_idx = end_idx - self.window_size * self.feature_dim
            reshaped = features[start_idx:end_idx].reshape(self.window_size, self.feature_dim)

        return reshaped.astype(np.float32)
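

# Shape sanity check (illustrative numbers, not taken from the data pipeline):
# with the 3 timeframes configured in train_rl() and, say, 5 features per
# timeframe, feature_dim = 5 * 3 = 15, so each observation fed to the CNN has
# shape (window_size, feature_dim) = (20, 15).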


def train_rl():
    """
    Train the RL model using the DQN agent
    """
    # Initialize data interface with BTC/USDT and multiple timeframes
    timeframes = ['1m', '5m', '15m']
    window_size = 20
    data_interface = DataInterface(symbol="BTC/USDT", timeframes=timeframes)

    # Get training data
    X_train, y_train, X_val, y_val, train_prices, val_prices = data_interface.prepare_training_data()
    if X_train is None:
        logger.error("Failed to get training data")
        return

    # Calculate feature dimensions
    num_features = X_train.shape[2]                  # Number of features per timeframe
    total_features = num_features * len(timeframes)  # Total features across all timeframes

    # Reshape features to (batch, window, features) for the environment
    n_samples = X_train.shape[0]
    flattened_features = X_train.reshape(n_samples, window_size, -1)

    # Create a DataFrame with one uniquely named column per (window step, feature)
    feature_columns = [f'feature_{i}' for i in range(window_size * flattened_features.shape[2])]
    df = pd.DataFrame(flattened_features.reshape(n_samples, -1), columns=feature_columns)
    df['close'] = train_prices

    # Create environment
    env = RLTradingEnvironment(
        data=df,
        window_size=window_size,
        num_features=num_features,
        num_timeframes=len(timeframes),
        initial_balance=10000,
        fee_rate=0.001,  # 0.001 = 0.1% fee
        max_steps=1000
    )

    # Create DQN agent
    agent = DQNAgent(
        state_size=window_size,  # First dimension of the observation space
        action_size=env.action_space.n,
        window_size=window_size,
        num_features=num_features,
        timeframes=timeframes,
        learning_rate=0.001,
        gamma=0.99,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.995,
        memory_size=10000,
        batch_size=32,
        target_update=10
    )

    # Training parameters
    episodes = 1000
    max_steps = 1000
    best_reward = float('-inf')
    best_model_path = 'NN/models/saved/best_rl_model.pth'

    # Create models directory if it doesn't exist
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
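
    # Optional: TensorBoard logging. SummaryWriter is imported above but not
    # wired in; a minimal sketch (kept as comments so behaviour is unchanged,
    # with an illustrative log_dir) would be:
    #     writer = SummaryWriter(log_dir='runs/rl_training')
    #     ...inside the episode loop:
    #     writer.add_scalar('episode/total_reward', total_reward, episode)
    #     writer.add_scalar('episode/epsilon', agent.epsilon, episode)
    #     ...after training:
    #     writer.close()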

    # Training loop
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0

        for step in range(max_steps):
            # Get action from agent
            action = agent.act(state)

            # Take action in environment
            next_state, reward, done, info = env.step(action)

            # Store experience in agent's memory
            agent.remember(state, action, reward, next_state, done)

            # Train agent
            if len(agent.memory) > agent.batch_size:
                loss = agent.replay()
                if loss is not None:
                    logger.debug(f"Training loss: {loss:.4f}")

            # Update state and reward
            state = next_state
            total_reward += reward

            if done:
                break

        # Update epsilon
        agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

        # Log episode results
        logger.info(f"Episode: {episode + 1}/{episodes}")
        logger.info(f"Total Reward: {total_reward:.2f}")
        logger.info(f"Final Balance: {info['balance']:.2f}")
        logger.info(f"Max Drawdown: {info['max_drawdown']:.2%}")
        logger.info(f"Win Rate: {info['win_rate']:.2%}")
        logger.info(f"Epsilon: {agent.epsilon:.4f}")

        # Save best model
        if total_reward > best_reward:
            best_reward = total_reward
            agent.save(best_model_path)
            logger.info(f"New best model saved with reward: {best_reward:.2f}")

        # Save checkpoint every 100 episodes
        if (episode + 1) % 100 == 0:
            checkpoint_path = f'NN/models/saved/rl_model_episode_{episode + 1}.pth'
            agent.save(checkpoint_path)
            logger.info(f"Checkpoint saved at episode {episode + 1}")


if __name__ == "__main__":
    train_rl()