# gogo2/NN/train_rl.py
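"""
Train a DQN trading agent on real multi-timeframe market data.

The script builds technical-indicator features for 1m/1h/1d candles, wraps them in
RLTradingEnvironment (a gym.Env with Buy/Sell/Hold actions and shaped rewards), and
trains NN.models.dqn_agent.DQNAgent, logging progress to TensorBoard and to
rl_training.log. Checkpoints are written under NN/models/saved/ by default.
"""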
import torch
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import logging
import time
from datetime import datetime
import os
import sys
import pandas as pd
import gym
import json
import random
import torch.nn as nn
import contextlib

# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from NN.utils.data_interface import DataInterface
from NN.utils.trading_env import TradingEnvironment
from NN.models.dqn_agent import DQNAgent
from NN.utils.signal_interpreter import SignalInterpreter

# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('rl_training.log'),
        logging.StreamHandler()
    ]
)

# Set up device for PyTorch (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Log GPU status
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    gpu_names = [torch.cuda.get_device_name(i) for i in range(gpu_count)]
    logger.info(f"Using GPU: {gpu_names}")
    # Check for BFloat16 support (available on NVIDIA Ampere and newer GPUs)
    if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
        logger.info("BFloat16 precision is supported - will use for faster training")
else:
    logger.warning("GPU not available. Using CPU for training (slower).")


class RLTradingEnvironment(gym.Env):
    """
    Reinforcement Learning environment for trading with technical indicators
    from multiple timeframes
    """
    def __init__(self, features_1m, features_1h, features_1d, window_size=20, trading_fee=0.0025, min_trade_interval=15):
        super().__init__()
        # Core configuration
        self.window_size = window_size
        self.num_features = features_1m.shape[1] - 1  # Exclude close price

        # Count available timeframes
        self.num_timeframes = 3  # We require all timeframes now
        # Each timeframe window keeps its feature columns plus the close price,
        # so the combined observation has (num_features + 1) columns per timeframe
        self.feature_dim = (self.num_features + 1) * self.num_timeframes
        # Store features from different timeframes
        self.features_1m = features_1m
        self.features_1h = features_1h
        self.features_1d = features_1d

        # Trading parameters
        self.initial_balance = 1.0
        self.trading_fee = trading_fee  # Increased from 0.001 to 0.0025 (0.25%)
        self.min_trade_interval = min_trade_interval  # Minimum steps between trades

        # Define action and observation spaces
        self.action_space = gym.spaces.Discrete(3)  # 0: Buy, 1: Sell, 2: Hold
        self.observation_space = gym.spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size, self.feature_dim),
            dtype=np.float32
        )

        # State variables
        self.reset()

        # Callback for visualization or external monitoring
        self.action_callback = None

    def reset(self):
        """Reset the environment to initial state"""
        self.balance = self.initial_balance
        self.position = 0.0  # Amount of asset held
        self.current_step = self.window_size
        self.trades = 0
        self.wins = 0
        self.losses = 0
        self.trade_history = []
        self.last_trade_step = -self.min_trade_interval  # Initialize to allow immediate first trade

        # Get initial observation
        observation = self._get_observation()
        return observation
    def _get_observation(self):
        """
        Get the current state observation.
        Combine features from multiple timeframes, reshaped for the CNN.
        """
        # Calculate indices for each timeframe
        idx_1m = min(self.current_step, self.features_1m.shape[0] - 1)
        idx_1h = idx_1m // 60  # 60 minutes in an hour
        idx_1d = idx_1h // 24  # 24 hours in a day

        # Cap indices to prevent out of bounds
        idx_1h = min(idx_1h, self.features_1h.shape[0] - 1)
        idx_1d = min(idx_1d, self.features_1d.shape[0] - 1)

        # Extract feature windows from each timeframe
        window_1m = self.features_1m[max(0, idx_1m - self.window_size):idx_1m]

        # Handle hourly timeframe
        start_1h = max(0, idx_1h - self.window_size)
        window_1h = self.features_1h[start_1h:idx_1h]

        # Handle daily timeframe
        start_1d = max(0, idx_1d - self.window_size)
        window_1d = self.features_1d[start_1d:idx_1d]

        # Pad if needed (for higher timeframes)
        if len(window_1m) < self.window_size:
            padding = np.zeros((self.window_size - len(window_1m), window_1m.shape[1]))
            window_1m = np.vstack([padding, window_1m])
        if len(window_1h) < self.window_size:
            padding = np.zeros((self.window_size - len(window_1h), window_1h.shape[1]))
            window_1h = np.vstack([padding, window_1h])
        if len(window_1d) < self.window_size:
            padding = np.zeros((self.window_size - len(window_1d), window_1d.shape[1]))
            window_1d = np.vstack([padding, window_1d])

        # Combine features from all timeframes
        combined_features = np.hstack([
            window_1m.reshape(self.window_size, -1),
            window_1h.reshape(self.window_size, -1),
            window_1d.reshape(self.window_size, -1)
        ])

        # Convert to float32 and handle any NaN values
        combined_features = np.nan_to_num(combined_features, nan=0.0).astype(np.float32)
        return combined_features
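
    # Shape note (illustrative numbers): each timeframe window keeps every feature
    # column plus the close price, so with window_size=20 and, say, 10 indicator
    # columns per timeframe the combined observation is (20, 33) --
    # (10 indicators + 1 close) * 3 timeframes.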
    def step(self, action):
        """Take an action and return the next state, reward, done flag, and info"""
        # Initialize info dictionary for additional data
        info = {
            'trade_executed': False,
            'price_change': 0.0,
            'position_change': 0,
            'current_price': 0.0,
            'next_price': 0.0,
            'balance_change': 0.0,
            'reward_components': {},
            'future_prices': {}
        }

        # Get the current and next price
        current_price = self.features_1m[self.current_step, -1]

        # Handle edge case at the end of the data
        if self.current_step >= len(self.features_1m) - 1:
            next_price = current_price  # Use current price as next price
            done = True
        else:
            next_price = self.features_1m[self.current_step + 1, -1]
            done = False

        # Handle zero or negative price (data error)
        if current_price <= 0:
            current_price = 0.01  # Set to a small positive number
            logger.warning(f"Zero or negative price detected at step {self.current_step}. Setting to 0.01.")
        if next_price <= 0:
            next_price = current_price  # Use current price instead
            logger.warning(f"Zero or negative next price detected at step {self.current_step + 1}. Using current price.")

        # Calculate price change as percentage
        price_change_pct = ((next_price - current_price) / current_price) * 100

        # Store prices in info
        info['current_price'] = current_price
        info['next_price'] = next_price
        info['price_change'] = price_change_pct

        # Initialize reward components dictionary
        reward_components = {
            'holding_reward': 0.0,
            'action_reward': 0.0,
            'profit_reward': 0.0,
            'trade_freq_penalty': 0.0
        }

        # Default small negative reward to discourage inaction
        reward = -0.01
        reward_components['holding_reward'] = -0.01

        # Track previous balance for changes
        previous_balance = self.balance

        # Execute action (0: Buy, 1: Sell, 2: Hold)
        if action == 0:  # Buy
            if self.position == 0:  # Only buy if we don't already have a position
                # Calculate how much of the asset we can buy with 100% of balance
                self.position = self.balance / current_price
                self.balance = 0  # All balance used

                # If price goes up after buying, that's good
                expected_profit = price_change_pct

                # Scale reward based on expected profit
                if expected_profit > 0:
                    # Positive reward for profitable buy decision
                    action_reward = 0.1 + (expected_profit * 0.05)  # Base reward + profit-based bonus
                    reward_components['action_reward'] = action_reward
                    reward += action_reward
                else:
                    # Small negative reward for unprofitable buy
                    action_reward = -0.1 + (expected_profit * 0.03)  # Smaller penalty for small losses
                    reward_components['action_reward'] = action_reward
                    reward += action_reward

                # Check if we've traded too frequently
                if len(self.trade_history) > 0:
                    last_trade_step = self.trade_history[-1]['step']
                    if self.current_step - last_trade_step < 5:  # If less than 5 steps since last trade
                        freq_penalty = -0.2  # Penalty for trading too frequently
                        reward += freq_penalty
                        reward_components['trade_freq_penalty'] = freq_penalty

                # Record the trade
                self.trade_history.append({
                    'step': self.current_step,
                    'action': 'buy',
                    'price': current_price,
                    'position': self.position,
                    'balance': self.balance
                })
                info['trade_executed'] = True
                logger.info(f"Buy at step {self.current_step}, price: {current_price:.4f}, position: {self.position:.6f}")
        elif action == 1:  # Sell
            if self.position > 0:  # Only sell if we have a position
                # Calculate sale proceeds
                sale_value = self.position * current_price

                # Calculate profit or loss percentage from last buy
                last_buy_price = None
                for trade in reversed(self.trade_history):
                    if trade['action'] == 'buy':
                        last_buy_price = trade['price']
                        break

                # If we found the last buy price, calculate profit
                if last_buy_price is not None:
                    profit_pct = ((current_price - last_buy_price) / last_buy_price) * 100

                    # Track completed round-trip trades for win-rate reporting
                    self.trades += 1

                    # Highly reward profitable trades
                    if profit_pct > 0:
                        self.wins += 1
                        # Progressive reward based on profit percentage
                        profit_reward = min(5.0, profit_pct * 0.2)  # Cap at 5.0 to prevent exploitation
                        reward_components['profit_reward'] = profit_reward
                        reward += profit_reward
                        logger.info(f"Profitable trade! {profit_pct:.2f}% profit, reward: {profit_reward:.4f}")
                    else:
                        self.losses += 1
                        # Penalize losses more heavily based on size of loss
                        loss_penalty = max(-3.0, profit_pct * 0.15)  # Cap at -3.0 to prevent excessive punishment
                        reward_components['profit_reward'] = loss_penalty
                        reward += loss_penalty
                        logger.info(f"Loss trade! {profit_pct:.2f}% loss, penalty: {loss_penalty:.4f}")

                # If price goes down after selling, that's good
                if price_change_pct < 0:
                    # Reward for good timing on sell (avoiding future loss)
                    timing_reward = min(1.0, abs(price_change_pct) * 0.05)
                    reward_components['action_reward'] = timing_reward
                    reward += timing_reward

                # Check for trading too frequently
                if len(self.trade_history) > 0:
                    last_trade_step = self.trade_history[-1]['step']
                    if self.current_step - last_trade_step < 5:  # If less than 5 steps since last trade
                        freq_penalty = -0.2  # Penalty for trading too frequently
                        reward += freq_penalty
                        reward_components['trade_freq_penalty'] = freq_penalty

                # Update balance and position
                self.balance = sale_value
                position_change = self.position
                self.position = 0

                # Record the trade
                self.trade_history.append({
                    'step': self.current_step,
                    'action': 'sell',
                    'price': current_price,
                    'position': self.position,
                    'balance': self.balance
                })
                info['trade_executed'] = True
                info['position_change'] = position_change
                logger.info(f"Sell at step {self.current_step}, price: {current_price:.4f}, new balance: {self.balance:.4f}")
        elif action == 2:  # Hold
            # Small reward if holding was a good decision
            if self.position > 0 and price_change_pct > 0:  # Holding long position during price increase
                hold_reward = price_change_pct * 0.01  # Small reward proportional to price increase
                reward += hold_reward
                reward_components['holding_reward'] = hold_reward
            elif self.position == 0 and price_change_pct < 0:  # Holding cash during price decrease
                hold_reward = abs(price_change_pct) * 0.01  # Small reward for avoiding loss
                reward += hold_reward
                reward_components['holding_reward'] = hold_reward

        # Move to the next step
        self.current_step += 1

        # Update current portfolio value
        if self.position > 0:
            self.current_value = self.balance + (self.position * next_price)
        else:
            self.current_value = self.balance

        # Calculate balance change
        balance_change = self.current_value - previous_balance
        info['balance_change'] = balance_change

        # Check if we've reached the end of the data
        if self.current_step >= len(self.features_1m) - 1:
            done = True

            # Final evaluation if we have a position
            if self.position > 0:
                # Sell remaining position at the final price
                final_balance = self.balance + (self.position * next_price)

                # Calculate final portfolio value and return
                final_return_pct = ((final_balance - self.initial_balance) / self.initial_balance) * 100

                # Add big reward/penalty based on overall performance
                performance_reward = final_return_pct * 0.1
                reward += performance_reward
                reward_components['final_performance'] = performance_reward

                logger.info(f"Episode ended. Final balance: {final_balance:.4f}, Return: {final_return_pct:.2f}%")

        # Get future prices for evaluation (1-hour and 1-day ahead)
        info['future_prices'] = {}

        # 1-hour future price if hourly data is available
        if hasattr(self, 'features_1h') and self.features_1h is not None:
            # Find the closest hourly data point
            if self.current_step < len(self.features_1m):
                current_time = self.current_step  # Use as index for simplicity
                hourly_idx = min(current_time // 60, len(self.features_1h) - 1)  # Assuming 60 minutes per hour
                if hourly_idx < len(self.features_1h) - 1:
                    future_1h_price = self.features_1h[hourly_idx + 1, -1]
                    info['future_prices']['1h'] = future_1h_price

        # 1-day future price if daily data is available
        if hasattr(self, 'features_1d') and self.features_1d is not None:
            # Find the closest daily data point
            if self.current_step < len(self.features_1m):
                current_time = self.current_step  # Use as index for simplicity
                daily_idx = min(current_time // 1440, len(self.features_1d) - 1)  # Assuming 1440 minutes per day
                if daily_idx < len(self.features_1d) - 1:
                    future_1d_price = self.features_1d[daily_idx + 1, -1]
                    info['future_prices']['1d'] = future_1d_price

        # Get next observation
        next_state = self._get_observation()

        # Store reward components in info
        info['reward_components'] = reward_components

        # Clip reward to prevent extreme values
        reward = np.clip(reward, -10.0, 10.0)

        return next_state, reward, done, info
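
    # Worked example of the profit/loss shaping above (illustrative numbers):
    # a sell that closes a +10% round trip adds min(5.0, 10 * 0.2) = 2.0 to the reward,
    # while a -10% round trip adds max(-3.0, -10 * 0.15) = -1.5; the total step reward
    # is then clipped to [-10.0, 10.0].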
    def set_action_callback(self, callback):
        """
        Set a callback function to be called after each action

        Args:
            callback: Function with signature (action, price, reward, info)
        """
        self.action_callback = callback
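

# Usage sketch (hedged): driving RLTradingEnvironment manually with random actions.
# Assumes feature arrays of shape (n_rows, n_indicators + 1) with the close price in
# the last column; the array names below are placeholders, not real data.
#
#     env = RLTradingEnvironment(features_1m, features_1h, features_1d, window_size=20)
#     state = env.reset()
#     done = False
#     while not done:
#         action = env.action_space.sample()            # random Buy/Sell/Hold
#         state, reward, done, info = env.step(action)
#     print(env.trades, env.wins, env.balance)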


def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent",
             action_callback=None, episode_callback=None, symbol="BTC/USDT",
             pretrain_price_prediction_enabled=False, pretrain_epochs=10):
    """
    Train a reinforcement learning agent for trading using ONLY real market data

    Args:
        env_class: Optional environment class override
        num_episodes: Number of episodes to train for
        max_steps: Maximum steps per episode
        save_path: Path to save the trained model
        action_callback: Callback function for monitoring actions
        episode_callback: Callback function for monitoring episodes
        symbol: Trading symbol to use
        pretrain_price_prediction_enabled: DEPRECATED - No longer supported (synthetic data not used)
        pretrain_epochs: DEPRECATED - No longer supported (synthetic data not used)

    Returns:
        tuple: (trained agent, environment)
    """
    # Load data for the selected symbol
    data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m', '1h', '1d'])

    try:
        # Try to load data for the requested symbol using get_historical_data method
        data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
        data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
        data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)
        data_1h = data_interface.get_historical_data(timeframe='1h', n_candles=1000)
        data_1d = data_interface.get_historical_data(timeframe='1d', n_candles=500)

        if data_1m is None or data_5m is None or data_15m is None or data_1h is None or data_1d is None:
            raise FileNotFoundError("Could not retrieve all required timeframes data for specified symbol")
    except Exception as e:
        logger.warning(f"Data for {symbol} not available: {str(e)}. Using default cached data.")

        # Try to use cached data if available
        symbol = "BTC/USDT"
        data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m', '1h', '1d'])
        data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
        data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
        data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)
        data_1h = data_interface.get_historical_data(timeframe='1h', n_candles=1000)
        data_1d = data_interface.get_historical_data(timeframe='1d', n_candles=500)

        if data_1m is None or data_5m is None or data_15m is None or data_1h is None or data_1d is None:
            logger.error("Failed to retrieve all required timeframes data. Cannot continue training.")
            raise ValueError("No data available for training")
    # Create features from the data by adding technical indicators and converting to
    # numpy format, keeping the close price as the last column of each array
    def build_features(df):
        if df is None:
            return None
        df = data_interface.add_technical_indicators(df)
        return np.hstack([
            df.drop(['timestamp', 'close'], axis=1).values,
            df['close'].values.reshape(-1, 1)
        ])

    features_1m = build_features(data_1m)
    features_5m = build_features(data_5m)
    features_15m = build_features(data_15m)
    features_1h = build_features(data_1h)
    features_1d = build_features(data_1d)

    # Check if we have all the required features
    if features_1m is None or features_5m is None or features_15m is None or features_1h is None or features_1d is None:
        logger.error("Failed to create features for all timeframes.")
        raise ValueError("Could not create features for training")
    # Create the environment
    if env_class:
        # Use provided environment class
        env = env_class(features_1m, features_1h, features_1d)
    else:
        # Use the default environment
        env = RLTradingEnvironment(features_1m, features_1h, features_1d)

    # Set action callback if provided
    if action_callback:
        env.set_action_callback(action_callback)

    # Get environment properties for agent creation
    input_shape = env.observation_space.shape
    n_actions = env.action_space.n

    # Create the agent
    agent = DQNAgent(
        state_shape=input_shape,
        n_actions=n_actions,
        epsilon=1.0,
        epsilon_decay=0.995,
        epsilon_min=0.01,
        learning_rate=0.0001,
        gamma=0.99,
        buffer_size=10000,
        batch_size=64,
        device=device  # Pass device to agent for GPU usage
    )
    # Check if model file exists and load it
    model_file = f"{save_path}_model.pth"
    if os.path.exists(model_file):
        try:
            agent.load(model_file)
            logger.info(f"Loaded existing model from {model_file}")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
    else:
        logger.info("No existing model found. Starting with a new model.")
    # Pre-training was removed because it relied on synthetic data;
    # pre-training with real data would require a separate implementation
    if pretrain_price_prediction_enabled:
        logger.warning("Pre-training with synthetic data is no longer supported. Continuing with RL training only.")

    # Create TensorBoard writer
    writer = SummaryWriter(log_dir=f'runs/dqn_{int(time.time())}')

    # Log GPU status to TensorBoard
    writer.add_text("hardware/device", str(device), 0)
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            writer.add_text(f"hardware/gpu_{i}", torch.cuda.get_device_name(i), 0)
    # Training loop
    total_rewards = []
    trade_win_rates = []
    best_reward = -np.inf

    # Move models to the appropriate device if not already there
    agent.move_models_to_device(device)

    # Enable mixed precision if GPU and feature is available
    use_mixed_precision = False
    if torch.cuda.is_available() and hasattr(torch.cuda, 'amp'):
        logger.info("Enabling mixed precision training")
        use_mixed_precision = True
        scaler = torch.cuda.amp.GradScaler()
    # Define step callback that forwards per-step data to the external monitor, if any
    def step_callback(action, price, reward, info):
        # Pass to external callback if provided
        if action_callback:
            action_callback(env.current_step, action, price, reward, info)
    # Main training loop
    logger.info(f"Starting training for {num_episodes} episodes...")
    logger.info(f"Starting training on device: {agent.device}")

    try:
        for episode in range(num_episodes):
            state = env.reset()
            total_reward = 0

            for step in range(max_steps):
                # Select action
                action = agent.act(state)

                # Take action and observe next state and reward
                next_state, reward, done, info = env.step(action)

                # Store the experience in memory
                agent.remember(state, action, reward, next_state, done)

                # Update state and reward
                state = next_state
                total_reward += reward

                # Train the agent by sampling from memory
                if len(agent.memory) >= agent.batch_size:
                    loss = agent.replay()

                if done or step == max_steps - 1:
                    break

            # Track rewards
            total_rewards.append(total_reward)

            # Calculate trading metrics
            win_rate = env.wins / max(1, env.trades)
            trades = env.trades

            # Log to TensorBoard
            writer.add_scalar('Reward/Episode', total_reward, episode)
            writer.add_scalar('Trade/WinRate', win_rate, episode)
            writer.add_scalar('Trade/Count', trades, episode)
            # Save best model
            if total_reward > best_reward and episode > 10:
                logger.info(f"New best episode reward: {total_reward:.4f}, saving model")
                agent.save(save_path)
                best_reward = total_reward
            # Periodic save every 100 episodes
            if episode % 100 == 0 and episode > 0:
                agent.save(f"{save_path}_episode_{episode}")

            # Call episode callback if provided
            if episode_callback:
                # Add environment to info dict to use for extrema training
                info_with_env = info.copy()
                info_with_env['env'] = env
                episode_callback(episode, total_reward, info_with_env)

        # Final save
        logger.info("Training completed, saving final model")
        agent.save(f"{save_path}_final")

    except Exception as e:
        logger.error(f"Training failed: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())

    # Close TensorBoard writer
    writer.close()

    return agent, env


if __name__ == "__main__":
    train_rl()
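
# Usage sketch (assumed invocation; adjust symbol/save_path to your setup):
#
#     python NN/train_rl.py          # run with defaults from the project root
#
# or, from another module:
#
#     from NN.train_rl import train_rl
#     agent, env = train_rl(num_episodes=1000, symbol="ETH/USDT",
#                           save_path="NN/models/saved/dqn_agent")
#
# The returned agent/env pair can then be reused for evaluation or further training.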