RL training

Author: Dobromir Popov
Date: 2025-03-31 03:31:54 +03:00
Parent: 1610d5bd49
Commit: 4eac14022c
9 changed files with 1492 additions and 247 deletions

NN/train_rl.py (new file, 192 lines added)

@@ -0,0 +1,192 @@
import torch
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import logging
import time
from datetime import datetime
import os
import sys
import pandas as pd
import gym
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from NN.utils.data_interface import DataInterface
from NN.utils.trading_env import TradingEnvironment
from NN.models.dqn_agent import DQNAgent
from NN.utils.signal_interpreter import SignalInterpreter
# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('rl_training.log'),
        logging.StreamHandler()
    ]
)

class RLTradingEnvironment(TradingEnvironment):
    """Extended trading environment that reshapes state for CNN"""

    def __init__(self, data, window_size, num_features, num_timeframes, **kwargs):
        # Set attributes before parent initialization
        self.window_size = window_size
        self.num_features = num_features
        self.num_timeframes = num_timeframes
        self.feature_dim = num_features * num_timeframes

        # Initialize parent class
        super().__init__(data=data, **kwargs)

        # Update observation space for CNN
        self.observation_space = gym.spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size, self.feature_dim),
            dtype=np.float32
        )

    def _get_observation(self):
        """Get current observation reshaped for CNN"""
        # Get flattened observation from parent class
        flat_obs = super()._get_observation()

        # Extract features (exclude close price)
        features = flat_obs[:-1]  # Remove close price

        # Calculate number of complete windows
        n_windows = len(features) // self.feature_dim
        if n_windows < self.window_size:
            # Pad with zeros if not enough data
            padding = np.zeros((self.window_size - n_windows, self.feature_dim))
            reshaped = np.vstack([
                padding,
                features[-(n_windows * self.feature_dim):].reshape(n_windows, self.feature_dim)
            ])
        else:
            # Take the most recent window_size windows
            start_idx = (n_windows - self.window_size) * self.feature_dim
            reshaped = features[start_idx:].reshape(self.window_size, self.feature_dim)

        return reshaped.astype(np.float32)
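
# Shape illustration for _get_observation(): with window_size=20 and the three
# timeframes used below, and assuming for the sake of example 5 features per
# timeframe (the real count comes from the data at runtime), feature_dim is
# 5 * 3 = 15 and the observation is a (20, 15) float32 array. If only, say,
# 12 complete windows of history are available, the first 8 rows are zero
# padding and the last 12 rows hold the most recent feature windows.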

def train_rl():
    """
    Train the RL model using the DQN agent
    """
    # Initialize data interface with BTC/USDT and multiple timeframes
    timeframes = ['1m', '5m', '15m']
    window_size = 20
    data_interface = DataInterface(symbol="BTC/USDT", timeframes=timeframes)

    # Get training data
    X_train, y_train, X_val, y_val, train_prices, val_prices = data_interface.prepare_training_data()
    if X_train is None:
        logger.error("Failed to get training data")
        return

    # Calculate feature dimensions
    num_features = X_train.shape[2]  # Number of features per timeframe
    total_features = num_features * len(timeframes)  # Total features across all timeframes

    # Flatten features for environment
    n_samples = X_train.shape[0]
    flattened_features = X_train.reshape(n_samples, window_size, -1)  # Reshape to (batch, window, features)

    # Create DataFrame with features as separate columns
    feature_columns = [f'feature_{i}' for i in range(flattened_features.shape[2])]
    df = pd.DataFrame(flattened_features.reshape(n_samples, -1), columns=feature_columns * window_size)
    df['close'] = train_prices

    # Create environment
    env = RLTradingEnvironment(
        data=df,
        window_size=window_size,
        num_features=num_features,
        num_timeframes=len(timeframes),
        initial_balance=10000,
        fee_rate=0.001,
        max_steps=1000
    )

    # Create DQN agent
    agent = DQNAgent(
        state_size=window_size,  # First dimension of observation space
        action_size=env.action_space.n,
        window_size=window_size,
        num_features=num_features,
        timeframes=timeframes,
        learning_rate=0.001,
        gamma=0.99,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.995,
        memory_size=10000,
        batch_size=32,
        target_update=10
    )

    # Training parameters
    episodes = 1000
    max_steps = 1000
    best_reward = float('-inf')
    best_model_path = 'NN/models/saved/best_rl_model.pth'

    # Create models directory if it doesn't exist
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

    # Training loop
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0

        for step in range(max_steps):
            # Get action from agent
            action = agent.act(state)

            # Take action in environment
            next_state, reward, done, info = env.step(action)

            # Store experience in agent's memory
            agent.remember(state, action, reward, next_state, done)

            # Train agent
            if len(agent.memory) > agent.batch_size:
                loss = agent.replay()
                if loss is not None:
                    logger.debug(f"Training loss: {loss:.4f}")

            # Update state and reward
            state = next_state
            total_reward += reward

            if done:
                break

        # Update epsilon
        agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

        # Log episode results
        logger.info(f"Episode: {episode + 1}/{episodes}")
        logger.info(f"Total Reward: {total_reward:.2f}")
        logger.info(f"Final Balance: {info['balance']:.2f}")
        logger.info(f"Max Drawdown: {info['max_drawdown']:.2%}")
        logger.info(f"Win Rate: {info['win_rate']:.2%}")
        logger.info(f"Epsilon: {agent.epsilon:.4f}")

        # Save best model
        if total_reward > best_reward:
            best_reward = total_reward
            agent.save(best_model_path)
            logger.info(f"New best model saved with reward: {best_reward:.2f}")

        # Save checkpoint every 100 episodes
        if (episode + 1) % 100 == 0:
            checkpoint_path = f'NN/models/saved/rl_model_episode_{episode + 1}.pth'
            agent.save(checkpoint_path)
            logger.info(f"Checkpoint saved at episode {episode + 1}")


if __name__ == "__main__":
    train_rl()
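
A side note on the exploration schedule above: with epsilon starting at 1.0, a per-episode decay factor of 0.995, and a floor of 0.01, the agent only reaches its minimum exploration rate near the end of the 1000-episode run. The standalone sketch below (plain Python, independent of the project code) reproduces the per-episode update used in train_rl() and the closed-form episode count; the milestone episode numbers are just illustrative print points.

import math

epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.995

# Closed form: epsilon after n episodes is max(epsilon_min, epsilon_decay ** n),
# so the floor is reached once 0.995 ** n <= 0.01, i.e. after about 919 episodes.
episodes_to_floor = math.log(epsilon_min / epsilon) / math.log(epsilon_decay)
print(f"epsilon hits {epsilon_min} after ~{episodes_to_floor:.0f} episodes")

# Same schedule replayed step by step, as train_rl() does once per episode
eps = epsilon
for episode in range(1000):
    eps = max(epsilon_min, eps * epsilon_decay)
    if episode + 1 in (100, 500, 919, 1000):
        print(f"episode {episode + 1}: epsilon = {eps:.4f}")

So roughly the first 900 or so episodes still act partly at random, which is worth keeping in mind when reading the per-episode reward logs and the "best model" saves.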