# gogo2/NN/train_rl.py
import torch
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import logging
import time
from datetime import datetime
import os
import sys
import pandas as pd
import gym
import json
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from NN.utils.data_interface import DataInterface
from NN.utils.trading_env import TradingEnvironment
from NN.models.dqn_agent import DQNAgent
from NN.utils.signal_interpreter import SignalInterpreter
# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('rl_training.log'),
        logging.StreamHandler()
    ]
)

# Set up device for PyTorch (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Log GPU status
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    gpu_names = [torch.cuda.get_device_name(i) for i in range(gpu_count)]
    logger.info(f"Using GPU: {gpu_names}")

    # Check for BFloat16 support (Ampere and newer GPUs) for faster training
    if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
        logger.info("BFloat16 precision is supported - will use for faster training")
else:
    logger.warning("GPU not available. Using CPU for training (slower).")

class RLTradingEnvironment(gym.Env):
    """
    Reinforcement Learning environment for trading with technical indicators
    from multiple timeframes
    """
    def __init__(self, features_1m, features_5m, features_15m, window_size=20, trading_fee=0.0025, min_trade_interval=15):
        super().__init__()

        # Window and feature dimensions
        self.window_size = window_size
        self.num_features = features_1m.shape[1] - 1  # Exclude close price
        self.num_timeframes = 3  # 1m, 5m, 15m
        self.feature_dim = self.num_features * self.num_timeframes

        # Store features from different timeframes
        self.features_1m = features_1m
        self.features_5m = features_5m
        self.features_15m = features_15m

        # Trading parameters
        self.initial_balance = 1.0
        self.trading_fee = trading_fee  # 0.25% per trade (increased from 0.1%)
        self.min_trade_interval = min_trade_interval  # Minimum steps between trades

        # Define action and observation spaces
        self.action_space = gym.spaces.Discrete(3)  # 0: Buy, 1: Sell, 2: Hold
        self.observation_space = gym.spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size, self.feature_dim),
            dtype=np.float32
        )

        # State variables
        self.reset()

        # Callback for visualization or external monitoring
        self.action_callback = None

    def reset(self):
        """Reset the environment to initial state"""
        self.balance = self.initial_balance
        self.position = 0.0  # Amount of asset held
        self.current_step = self.window_size
        self.trades = 0
        self.wins = 0
        self.losses = 0
        self.trade_history = []
        self.last_trade_step = -self.min_trade_interval  # Allow an immediate first trade

        # Get initial observation
        observation = self._get_observation()
        return observation

    def _get_observation(self):
        """
        Get the current state observation.
        Combine features from multiple timeframes, reshaped for the CNN.
        """
        # Calculate indices for each timeframe
        idx_1m = self.current_step
        idx_5m = idx_1m // 5
        idx_15m = idx_1m // 15

        # Extract feature windows from each timeframe
        window_1m = self.features_1m[idx_1m - self.window_size:idx_1m]

        # Handle 5m timeframe
        start_5m = max(0, idx_5m - self.window_size)
        window_5m = self.features_5m[start_5m:idx_5m]

        # Handle 15m timeframe
        start_15m = max(0, idx_15m - self.window_size)
        window_15m = self.features_15m[start_15m:idx_15m]

        # Pad if needed (for 5m and 15m)
        if len(window_5m) < self.window_size:
            padding = np.zeros((self.window_size - len(window_5m), window_5m.shape[1]))
            window_5m = np.vstack([padding, window_5m])
        if len(window_15m) < self.window_size:
            padding = np.zeros((self.window_size - len(window_15m), window_15m.shape[1]))
            window_15m = np.vstack([padding, window_15m])

        # Combine features from all timeframes, dropping the close-price column
        # (last column) so the result matches the declared observation_space shape
        combined_features = np.hstack([
            window_1m[:, :-1].reshape(self.window_size, -1),
            window_5m[:, :-1].reshape(self.window_size, -1),
            window_15m[:, :-1].reshape(self.window_size, -1)
        ])

        # Convert to float32 and handle any NaN values
        combined_features = np.nan_to_num(combined_features, nan=0.0).astype(np.float32)
        return combined_features

    def step(self, action):
        """
        Take an action in the environment and return the next state, reward, done flag, and info

        Args:
            action (int): 0 = Buy, 1 = Sell, 2 = Hold

        Returns:
            tuple: (observation, reward, done, info)
        """
        # Get current and next price
        current_price = self.features_1m[self.current_step, -1]  # Close price is last column
        next_price = self.features_1m[self.current_step + 1, -1]

        # Handle zero or negative prices
        if current_price <= 0:
            current_price = 1e-8  # Small positive number
        if next_price <= 0:
            next_price = current_price  # Use current price if next price is invalid
        price_change = (next_price - current_price) / current_price

        # Default reward is slightly negative to discourage inaction
        reward = -0.0001
        done = False
        profit_pct = None  # Initialize profit_pct variable

        # Check if enough time has passed since the last trade
        trade_interval = self.current_step - self.last_trade_step
        trade_interval_penalty = 0

        # Execute action
        if action == 0:  # BUY
            if self.position == 0:  # Only buy if not already in position
                # Apply extra penalty for trading too frequently
                if trade_interval < self.min_trade_interval:
                    trade_interval_penalty = -0.002 * (self.min_trade_interval - trade_interval)
                    # Still allow the trade but with penalty

                self.position = self.balance * (1 - self.trading_fee)
                self.balance = 0
                self.trades += 1
                reward = -0.001 + trade_interval_penalty  # Small transaction cost plus any penalty
                self.trade_entry_price = current_price
                self.last_trade_step = self.current_step

        elif action == 1:  # SELL
            if self.position > 0:  # Only sell if in position
                # Apply extra penalty for trading too frequently
                if trade_interval < self.min_trade_interval:
                    trade_interval_penalty = -0.002 * (self.min_trade_interval - trade_interval)
                    # Still allow the trade but with penalty

                # Calculate position value at current price
                position_value = self.position * (1 + price_change)
                self.balance = position_value * (1 - self.trading_fee)

                # Calculate profit/loss from trade
                profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price

                # Scale reward by profit percentage and apply trade interval penalty
                reward = (profit_pct * 10) + trade_interval_penalty

                # Update win/loss count
                if profit_pct > 0:
                    self.wins += 1
                else:
                    self.losses += 1

                # Record trade
                self.trade_history.append({
                    'entry_price': self.trade_entry_price,
                    'exit_price': next_price,
                    'profit_pct': profit_pct,
                    'trade_interval': trade_interval
                })

                # Reset position and update last trade step
                self.position = 0
                self.last_trade_step = self.current_step

        # else: (action == 2 - HOLD) - no position change

        # Move to the next step
        self.current_step += 1

        # Check if done
        if self.current_step >= len(self.features_1m) - 1:
            done = True

            # Force close any open position at the end of the data
            if self.position > 0:
                position_value = self.position * (1 + price_change)
                self.balance = position_value * (1 - self.trading_fee)
                profit_pct = (next_price - self.trade_entry_price) / self.trade_entry_price
                reward += profit_pct * 10

                # Update win/loss count
                if profit_pct > 0:
                    self.wins += 1
                else:
                    self.losses += 1

                # Clear the position so the final total_value is not double-counted
                self.position = 0

        # Get the next observation
        observation = self._get_observation()

        # Calculate metrics for info
        total_value = self.balance + self.position * next_price
        gain = (total_value - self.initial_balance) / self.initial_balance
        self.win_rate = self.wins / max(1, self.trades)

        info = {
            'balance': self.balance,
            'position': self.position,
            'total_value': total_value,
            'gain': gain,
            'trades': self.trades,
            'win_rate': self.win_rate,
            'profit_pct': profit_pct if action == 1 and self.position == 0 else None,
            'current_price': current_price,
            'next_price': next_price
        }

        # Call the callback if it exists
        if self.action_callback:
            self.action_callback(action, current_price, reward, info)

        return observation, reward, done, info

    def set_action_callback(self, callback):
        """
        Set a callback function to be called after each action

        Args:
            callback: Function with signature (action, price, reward, info)
        """
        self.action_callback = callback

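
# Illustrative sketch (an addition, not part of the original training flow): a quick
# way to exercise the environment with random placeholder features and random actions
# to check observation shapes and the reward/info plumbing. Real usage builds the
# feature arrays from DataInterface, as train_rl() does below; the sizes and the
# helper name here are assumptions chosen only for the example.
def _example_random_rollout(num_steps=50):
    rng = np.random.default_rng(0)
    # 10 columns per timeframe; the last column plays the role of the close price
    features_1m = rng.uniform(90.0, 110.0, size=(600, 10)).astype(np.float32)
    features_5m = rng.uniform(90.0, 110.0, size=(120, 10)).astype(np.float32)
    features_15m = rng.uniform(90.0, 110.0, size=(40, 10)).astype(np.float32)

    env = RLTradingEnvironment(features_1m, features_5m, features_15m)
    obs = env.reset()
    assert obs.shape == env.observation_space.shape

    info = {}
    for _ in range(num_steps):
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        if done:
            break
    return info
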
def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent",
             action_callback=None, episode_callback=None, symbol="BTC/USDT"):
    """
    Train a reinforcement learning agent for trading

    Args:
        env_class: Optional environment class override
        num_episodes: Number of episodes to train for
        max_steps: Maximum steps per episode
        save_path: Path to save the trained model
        action_callback: Callback function for monitoring actions
        episode_callback: Callback function for monitoring episodes
        symbol: Trading symbol to use

    Returns:
        tuple: (trained agent, environment)
    """
    # Load data for the selected symbol
    data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m'])

    try:
        # Try to load data for the requested symbol using get_historical_data
        data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
        data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
        data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)

        if data_1m is None or data_5m is None or data_15m is None:
            raise FileNotFoundError("Could not retrieve data for specified symbol")
    except Exception as e:
        logger.warning(f"Data for {symbol} not available: {str(e)}. Falling back to BTC/USDT.")

        # Fall back to cached BTC/USDT data if available
        symbol = "BTC/USDT"
        data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m'])
        data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
        data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
        data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)

        if data_1m is None or data_5m is None or data_15m is None:
            logger.error("Failed to retrieve any data. Cannot continue training.")
            raise ValueError("No data available for training")

    # Create features from the data by adding technical indicators and converting to numpy format
    if data_1m is not None:
        data_1m = data_interface.add_technical_indicators(data_1m)
        # Convert to numpy array with close price as the last column
        features_1m = np.hstack([
            data_1m.drop(['timestamp', 'close'], axis=1).values,
            data_1m['close'].values.reshape(-1, 1)
        ])
    else:
        features_1m = None

    if data_5m is not None:
        data_5m = data_interface.add_technical_indicators(data_5m)
        # Convert to numpy array with close price as the last column
        features_5m = np.hstack([
            data_5m.drop(['timestamp', 'close'], axis=1).values,
            data_5m['close'].values.reshape(-1, 1)
        ])
    else:
        features_5m = None

    if data_15m is not None:
        data_15m = data_interface.add_technical_indicators(data_15m)
        # Convert to numpy array with close price as the last column
        features_15m = np.hstack([
            data_15m.drop(['timestamp', 'close'], axis=1).values,
            data_15m['close'].values.reshape(-1, 1)
        ])
    else:
        features_15m = None

    # Check if we have all the required features
    if features_1m is None or features_5m is None or features_15m is None:
        logger.error("Failed to create features for all timeframes.")
        raise ValueError("Could not create features for training")

    # Create the environment
    if env_class:
        # Use provided environment class
        env = env_class(features_1m, features_5m, features_15m)
    else:
        # Use the default environment
        env = RLTradingEnvironment(features_1m, features_5m, features_15m)

    # Set action callback if provided
    if action_callback:
        env.set_action_callback(action_callback)

    # Get environment properties for agent creation
    input_shape = env.observation_space.shape
    n_actions = env.action_space.n

    # Create the agent
    agent = DQNAgent(
        state_shape=input_shape,
        n_actions=n_actions,
        epsilon=1.0,
        epsilon_decay=0.995,
        epsilon_min=0.01,
        learning_rate=0.0001,
        gamma=0.99,
        buffer_size=10000,
        batch_size=64,
        device=device  # Pass device to agent for GPU usage
    )

    # Check if a saved model exists and load it
    model_file = f"{save_path}_model.pth"
    if os.path.exists(model_file):
        try:
            agent.load(model_file)
            logger.info(f"Loaded existing model from {model_file}")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
    else:
        logger.info("No existing model found. Starting with a new model.")

    # Create TensorBoard writer
    writer = SummaryWriter(log_dir=f'runs/dqn_{int(time.time())}')

    # Log GPU status to TensorBoard
    writer.add_text("hardware/device", str(device), 0)
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            writer.add_text(f"hardware/gpu_{i}", torch.cuda.get_device_name(i), 0)

    # Training loop
    total_rewards = []
    trade_win_rates = []
    best_reward = -np.inf

    # Move models to the appropriate device if not already there
    agent.move_models_to_device(device)

    # Enable mixed precision if a GPU with AMP support is available
    use_mixed_precision = False
    if torch.cuda.is_available() and hasattr(torch.cuda, 'amp'):
        logger.info("Enabling mixed precision training")
        use_mixed_precision = True
        scaler = torch.cuda.amp.GradScaler()

    # Define step callback for TensorBoard logging and model tracking
    def step_callback(action, price, reward, info):
        # Pass to external callback if provided
        if action_callback:
            action_callback(env.current_step, action, price, reward, info)

try:
for episode in range(num_episodes):
state = env.reset()
total_reward = 0
for step in range(max_steps):
# Select action
action = agent.act(state)
# Take action and observe next state and reward
next_state, reward, done, info = env.step(action)
# Store the experience in memory
agent.remember(state, action, reward, next_state, done)
# Update state and reward
state = next_state
total_reward += reward
# Train the agent by sampling from memory
if len(agent.memory) >= agent.batch_size:
loss = agent.replay()
if done or step == max_steps - 1:
break
# Track rewards
total_rewards.append(total_reward)
# Calculate trading metrics
win_rate = env.win_rate if hasattr(env, 'win_rate') else 0
trades = env.trades if hasattr(env, 'trades') else 0
# Log to TensorBoard
writer.add_scalar('Reward/Episode', total_reward, episode)
writer.add_scalar('Trade/WinRate', win_rate, episode)
writer.add_scalar('Trade/Count', trades, episode)
# Save best model
if total_reward > best_reward and episode > 10:
logger.info(f"New best average reward: {total_reward:.4f}, saving model")
agent.save(save_path)
best_reward = total_reward
# Periodic save every 100 episodes
if episode % 100 == 0 and episode > 0:
agent.save(f"{save_path}_episode_{episode}")
# Call episode callback if provided
if episode_callback:
# Add environment to info dict to use for extrema training
info_with_env = info.copy()
info_with_env['env'] = env
episode_callback(episode, total_reward, info_with_env)
# Final save
logger.info("Training completed, saving final model")
agent.save(f"{save_path}_final")
except Exception as e:
logger.error(f"Training failed: {str(e)}")
import traceback
logger.error(traceback.format_exc())
# Close TensorBoard writer
writer.close()
return agent, env
if __name__ == "__main__":
    train_rl()
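
# Example (illustrative; parameter values are placeholders, not recommendations):
# a shorter training run on another symbol can be started by overriding the defaults:
#   agent, env = train_rl(num_episodes=100, max_steps=500, symbol="ETH/USDT")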