"""
|
|
Data Interface for Neural Network Trading System
|
|
|
|
This module provides functionality to fetch, process, and prepare data for the neural network models.
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import numpy as np
|
|
import pandas as pd
|
|
from datetime import datetime, timedelta
|
|
import json
|
|
import pickle
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|


class DataInterface:
    """
    Enhanced Data Interface supporting:
    - Multiple trading pairs (up to 3)
    - Multiple timeframes per pair (1s, 1m, 1h, 1d + custom)
    - Technical indicators (up to 20)
    - Cross-timeframe normalization
    - Real-time tick streaming
    """

    def __init__(self, symbol=None, timeframes=None, data_dir="NN/data"):
        """
        Initialize the data interface.

        Args:
            symbol (str): Trading pair symbol (e.g., "BTC/USDT")
            timeframes (list): List of timeframes to use (e.g., ['1m', '5m', '1h', '4h', '1d'])
            data_dir (str): Directory to store/load datasets
        """
        self.symbol = symbol
        self.timeframes = timeframes or ['1h', '4h', '1d']
        self.data_dir = data_dir
        self.scalers = {}  # Store scalers for each timeframe
        self.last_refresh = datetime.min  # Timestamp of the last forced data refresh

        # Create data directory if it doesn't exist
        os.makedirs(self.data_dir, exist_ok=True)

        # Initialize empty dataframes for each timeframe
        self.dataframes = {tf: None for tf in self.timeframes}

        logger.info(f"DataInterface initialized for {self.symbol} with timeframes {self.timeframes}")

    def get_historical_data(self, timeframe='1h', n_candles=1000, use_cache=True):
        """
        Fetch historical price data for a given timeframe.

        Args:
            timeframe (str): Timeframe to fetch data for
            n_candles (int): Number of candles to fetch
            use_cache (bool): Whether to use cached data if available

        Returns:
            pd.DataFrame: DataFrame with OHLCV data
        """
        cache_file = os.path.join(self.data_dir, f"{self.symbol.replace('/', '_')}_{timeframe}.csv")

        # Check whether cached data exists and contains enough candles
        if use_cache and os.path.exists(cache_file):
            try:
                df = pd.read_csv(cache_file, parse_dates=['timestamp'])
                if len(df) >= n_candles:
                    logger.info(f"Using cached data for {self.symbol} {timeframe} ({len(df)} candles)")
                    self.dataframes[timeframe] = df
                    return df.tail(n_candles)
            except Exception as e:
                logger.error(f"Error reading cached data: {str(e)}")

        # Cache miss or insufficient data: fetch fresh data.
        # For now this uses a placeholder instead of a real exchange API,
        # and falls back to dummy data when nothing can be fetched.
        try:
            logger.info(f"Fetching historical data for {self.symbol} {timeframe}")

            # Placeholder for real data fetching; in a real implementation
            # this would be replaced with exchange/API calls.
            self._fetch_data_from_exchange(timeframe, n_candles)

            # Save to cache
            if self.dataframes[timeframe] is not None:
                self.dataframes[timeframe].to_csv(cache_file, index=False)
                return self.dataframes[timeframe]
            else:
                # Create dummy data as a fallback
                logger.warning(f"Could not fetch data for {self.symbol} {timeframe}, using dummy data")
                df = self._create_dummy_data(timeframe, n_candles)
                self.dataframes[timeframe] = df
                return df
        except Exception as e:
            logger.error(f"Error fetching data: {str(e)}")
            return None

    def _fetch_data_from_exchange(self, timeframe, n_candles):
        """
        Placeholder method for fetching data from an exchange.
        In a real implementation, this would connect to an exchange API
        and download OHLCV candles.
        """
        # For now, just generate dummy data
        self.dataframes[timeframe] = self._create_dummy_data(timeframe, n_candles)
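
    # The sketch below is illustrative and not part of the original module: it shows
    # one way the placeholder above could be replaced with a real data fetch using the
    # third-party `ccxt` library (assumed to be installed; any ccxt-supported exchange
    # such as Binance would work). It is kept as a separate helper so the dummy-data
    # path above remains the default.
    def _fetch_data_via_ccxt(self, timeframe, n_candles, exchange_id="binance"):
        """Illustrative sketch: fetch OHLCV candles via ccxt (assumes ccxt is installed)."""
        import ccxt  # imported locally so the module still works without ccxt

        exchange = getattr(ccxt, exchange_id)()
        # ccxt returns rows of [timestamp_ms, open, high, low, close, volume]
        raw = exchange.fetch_ohlcv(self.symbol, timeframe=timeframe, limit=n_candles)
        df = pd.DataFrame(raw, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
        self.dataframes[timeframe] = df
        return df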

    def _create_dummy_data(self, timeframe, n_candles):
        """
        Create dummy OHLCV data for testing purposes.

        Args:
            timeframe (str): Timeframe to create data for
            n_candles (int): Number of candles to create

        Returns:
            pd.DataFrame: DataFrame with dummy OHLCV data
        """
        # Map timeframe to seconds
        tf_seconds = {
            '1m': 60,
            '5m': 300,
            '15m': 900,
            '1h': 3600,
            '4h': 14400,
            '1d': 86400
        }
        seconds = tf_seconds.get(timeframe, 3600)  # Default to 1h

        # Create timestamps, oldest first
        end_time = datetime.now()
        timestamps = [end_time - timedelta(seconds=seconds * i) for i in range(n_candles)]
        timestamps.reverse()

        # Generate random price data with realistic patterns
        np.random.seed(42)  # For reproducibility

        # Start price
        price = 50000  # For BTC/USDT
        prices = []
        volumes = []

        for i in range(n_candles):
            # Random walk with drift and volatility based on timeframe
            drift = 0.0001 * seconds  # Larger drift for larger timeframes
            volatility = 0.01 * np.sqrt(seconds / 3600)  # Scale volatility by sqrt of time

            # Daily/weekly cyclical patterns
            if timeframe in ['1d', '4h']:
                cycle = np.sin(i / 7 * np.pi) * 0.02  # Weekly cycle
            else:
                cycle = np.sin(i / 24 * np.pi) * 0.01  # Daily cycle

            # Calculate price change with random walk + cycles (clamped to prevent overflow)
            price_change = price * np.clip(drift + volatility * np.random.randn() + cycle, -0.1, 0.1)
            price = np.clip(price + price_change, 1000, 100000)  # Keep price in a reasonable range

            # Generate OHLC from the price
            open_price = price
            high_price = price * (1 + abs(0.005 * np.random.randn()))
            low_price = price * (1 - abs(0.005 * np.random.randn()))
            close_price = price * (1 + 0.002 * np.random.randn())

            # Ensure high >= open/close and low <= open/close
            high_price = max(high_price, open_price, close_price)
            low_price = min(low_price, open_price, close_price)

            # Generate volume (higher for larger price movements)
            volume = 10000 + 5000 * np.random.rand() + abs(price_change) / price * 10000

            prices.append((open_price, high_price, low_price, close_price))
            volumes.append(volume)

            # Update price for the next iteration
            price = close_price

        # Create DataFrame
        df = pd.DataFrame(
            [(t, o, h, l, c, v) for t, (o, h, l, c), v in zip(timestamps, prices, volumes)],
            columns=['timestamp', 'open', 'high', 'low', 'close', 'volume']
        )

        return df

    def prepare_nn_input(self, timeframes=None, n_candles=500, window_size=20):
        """
        Prepare input data for neural network models.

        Args:
            timeframes (list): List of timeframes to use
            n_candles (int): Number of candles to fetch for each timeframe
            window_size (int): Size of the sliding window for feature creation

        Returns:
            tuple: (X, y, timestamps) where:
                X is the input features array with shape (n_samples, window_size, n_features)
                y is the target array with shape (n_samples,)
                timestamps is an array of timestamps for each sample
        """
        if timeframes is None:
            timeframes = self.timeframes

        # Get data for all requested timeframes
        dfs = {}
        for tf in timeframes:
            df = self.get_historical_data(timeframe=tf, n_candles=n_candles)
            if df is not None and not df.empty:
                dfs[tf] = df

        if not dfs:
            logger.error("No data available for feature creation")
            return None, None, None

        # Create features for each timeframe
        features = []
        targets = []
        timestamps = []

        for tf in timeframes:
            if tf in dfs:
                X, y, ts = self._create_features(dfs[tf], window_size)
                if X is not None and y is not None:
                    features.append(X)
                    if len(targets) == 0:  # Only need targets from one timeframe
                        targets = y
                        timestamps = ts

        if not features:
            logger.error("Failed to create features for any timeframe")
            return None, None, None

        # Stack features from all timeframes along the feature dimension:
        # reshape each timeframe's features to [samples, window, 1, features],
        # concatenate along the new axis, then flatten back to
        # [samples, window, features * timeframes].
        reshaped_features = [f.reshape(f.shape[0], f.shape[1], 1, f.shape[2])
                             for f in features]
        X = np.concatenate(reshaped_features, axis=2)
        X = X.reshape(X.shape[0], X.shape[1], -1)

        # Validate data
        if np.any(np.isnan(X)) or np.any(np.isinf(X)):
            logger.error("Generated features contain NaN or infinite values")
            return None, None, None

        # Ensure all values are finite and within a sane range
        X = np.nan_to_num(X, nan=0.0, posinf=1.0, neginf=-1.0)
        X = np.clip(X, -1e6, 1e6)  # Clip extreme values

        # Log data shapes for debugging
        logger.info(f"Prepared input data - X shape: {X.shape}, y shape: {np.array(targets).shape}")

        return X, targets, timestamps

    def _create_features(self, df, window_size):
        """
        Create features from OHLCV data using a sliding window approach.

        Args:
            df (pd.DataFrame): DataFrame with OHLCV data
            window_size (int): Size of the sliding window

        Returns:
            tuple: (X, y, timestamps) where:
                X is the input features array
                y is the target array
                timestamps is an array of timestamps for each sample
        """
        # Extract OHLCV columns
        ohlcv = df[['open', 'high', 'low', 'close', 'volume']].values

        # Validate data before scaling
        if np.any(np.isnan(ohlcv)) or np.any(np.isinf(ohlcv)):
            logger.error("Input data contains NaN or infinite values")
            return None, None, None

        # Handle potential constant columns (avoid division by zero in the scaler)
        ohlcv = np.nan_to_num(ohlcv, nan=0.0)
        ranges = np.ptp(ohlcv, axis=0)
        for i in range(len(ranges)):
            if ranges[i] == 0:  # Constant column
                ohlcv[:, i] = 1 if i == 3 else 0  # Set close to 1, others to 0

        # Scale the data with safety checks
        try:
            scaler = MinMaxScaler()
            ohlcv_scaled = scaler.fit_transform(ohlcv)
            if np.any(np.isnan(ohlcv_scaled)) or np.any(np.isinf(ohlcv_scaled)):
                logger.error("Scaling produced invalid values")
                return None, None, None
        except Exception as e:
            logger.error(f"Scaling failed: {str(e)}")
            return None, None, None

        # Store the scaler for later use, keyed by the timeframe this dataframe belongs to
        timeframe = next((tf for tf in self.timeframes
                          if self.dataframes.get(tf) is not None and self.dataframes[tf].equals(df)),
                         'unknown')
        self.scalers[timeframe] = scaler

        # Create sliding windows
        X = []
        y = []
        timestamps = []

        for i in range(len(ohlcv_scaled) - window_size):
            # Input: window_size candles of OHLCV data
            window = ohlcv_scaled[i:i + window_size]

            # Skip windows containing invalid values
            if np.any(np.isnan(window)) or np.any(np.isinf(window)):
                continue

            X.append(window)

            # Target: binary classification - 1 if the close price increases
            # in the next candle, 0 otherwise
            price_change = ohlcv[i + window_size, 3] - ohlcv[i + window_size - 1, 3]
            y.append(1 if price_change > 0 else 0)

            # Store timestamp for reference
            timestamps.append(df['timestamp'].iloc[i + window_size])

        if not X:
            logger.error("No valid windows created")
            return None, None, None

        X = np.array(X)
        y = np.array(y)
        timestamps = np.array(timestamps)

        # Log shapes for debugging
        logger.info(f"Created features - X shape: {X.shape}, y shape: {y.shape}")

        return X, y, timestamps

    def generate_training_dataset(self, timeframes=None, n_candles=1000, window_size=20):
        """
        Generate and save a training dataset for neural network models.

        Args:
            timeframes (list): List of timeframes to use
            n_candles (int): Number of candles to fetch for each timeframe
            window_size (int): Size of the sliding window for feature creation

        Returns:
            dict: Dictionary of dataset file paths
        """
        if timeframes is None:
            timeframes = self.timeframes

        # Prepare inputs
        X, y, timestamps = self.prepare_nn_input(timeframes, n_candles, window_size)

        if X is None or y is None:
            logger.error("Failed to prepare input data for dataset")
            return None

        # Prepare output paths
        timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        dataset_name = f"{self.symbol.replace('/', '_')}_{'_'.join(timeframes)}_{timestamp_str}"

        X_path = os.path.join(self.data_dir, f"{dataset_name}_X.npy")
        y_path = os.path.join(self.data_dir, f"{dataset_name}_y.npy")
        timestamps_path = os.path.join(self.data_dir, f"{dataset_name}_timestamps.npy")
        metadata_path = os.path.join(self.data_dir, f"{dataset_name}_metadata.json")

        # Save arrays
        np.save(X_path, X)
        np.save(y_path, y)
        np.save(timestamps_path, timestamps)

        # Save metadata
        metadata = {
            'symbol': self.symbol,
            'timeframes': timeframes,
            'window_size': window_size,
            'n_samples': len(X),
            'feature_shape': X.shape[1:],
            'created_at': datetime.now().isoformat(),
            'dataset_name': dataset_name
        }

        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        # Save scalers
        scaler_path = os.path.join(self.data_dir, f"{dataset_name}_scalers.pkl")
        with open(scaler_path, 'wb') as f:
            pickle.dump(self.scalers, f)

        # Return dataset info
        dataset_info = {
            'X_path': X_path,
            'y_path': y_path,
            'timestamps_path': timestamps_path,
            'metadata_path': metadata_path,
            'scaler_path': scaler_path
        }

        logger.info(f"Dataset generated and saved: {dataset_name}")
        return dataset_info

    def get_feature_count(self):
        """Get the number of features per timestep in a prepared input sample."""
        # OHLCV (5 features) per timeframe
        return 5 * len(self.timeframes)

    def calculate_pnl(self, predictions, actual_prices, position_size=1.0):
        """
        Calculate PnL based on predictions and actual price movements.

        Args:
            predictions (np.array): Model predictions (0: sell, 1: hold, 2: buy)
            actual_prices (np.array): Actual price data
            position_size (float): Size of the position to trade

        Returns:
            tuple: (total_pnl, win_rate, trade_history)
        """
        if len(predictions) != len(actual_prices) - 1:
            logger.error("Predictions and prices length mismatch")
            return 0.0, 0.0, []

        pnl = 0.0
        trades = 0
        wins = 0
        trade_history = []

        for i in range(len(predictions)):
            pred = predictions[i]
            current_price = actual_prices[i]
            next_price = actual_prices[i + 1]

            # Calculate price change percentage
            price_change = (next_price - current_price) / current_price

            # Calculate PnL based on the prediction
            if pred == 2:  # Buy
                trade_pnl = price_change * position_size
                trades += 1
                if trade_pnl > 0:
                    wins += 1
                trade_history.append({
                    'type': 'buy',
                    'price': current_price,
                    'pnl': trade_pnl,
                    'timestamp': self.dataframes[self.timeframes[0]]['timestamp'].iloc[i]
                })
                pnl += trade_pnl
            elif pred == 0:  # Sell
                trade_pnl = -price_change * position_size
                trades += 1
                if trade_pnl > 0:
                    wins += 1
                trade_history.append({
                    'type': 'sell',
                    'price': current_price,
                    'pnl': trade_pnl,
                    'timestamp': self.dataframes[self.timeframes[0]]['timestamp'].iloc[i]
                })
                pnl += trade_pnl
            # pred == 1 (hold): no trade, no PnL

        win_rate = wins / trades if trades > 0 else 0.0
        return pnl, win_rate, trade_history
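
    # Illustrative worked example (not part of the original module), assuming the
    # hypothetical inputs predictions = [2, 0, 1] and
    # actual_prices = [100.0, 101.0, 100.5, 100.7] with position_size = 1.0:
    #   i=0: pred=2 (buy),  change = (101.0 - 100.0) / 100.0 = +0.0100 -> trade_pnl = +0.0100 (win)
    #   i=1: pred=0 (sell), change = (100.5 - 101.0) / 101.0 = -0.0050 -> trade_pnl = +0.0050 (win)
    #   i=2: pred=1 (hold), no trade
    # Result: pnl = 0.0150, trades = 2, wins = 2, win_rate = 1.0; trade timestamps are
    # taken from the first timeframe's dataframe.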

    def prepare_training_data(self, refresh=False, refresh_interval=300):
        """
        Prepare training and validation data with optional refresh.

        Args:
            refresh (bool): Whether to force refresh data
            refresh_interval (int): Minimum seconds between refreshes

        Returns:
            tuple: (X_train, y_train, X_val, y_val, train_prices, val_prices) numpy arrays
        """
        current_time = datetime.now()
        if refresh or (current_time - self.last_refresh).total_seconds() > refresh_interval:
            logger.info("Refreshing training data...")
            for tf in self.timeframes:
                self.get_historical_data(timeframe=tf, n_candles=1000, use_cache=False)
            self.last_refresh = current_time

        # Get all data
        X, y, _ = self.prepare_nn_input()
        if X is None:
            return None, None, None, None, None, None

        # Get price data for PnL calculation
        prices = self.dataframes[self.timeframes[0]]['close'].values

        # Split into train/validation (80/20)
        split_idx = int(len(X) * 0.8)
        return (X[:split_idx], y[:split_idx], X[split_idx:], y[split_idx:],
                prices[:split_idx], prices[split_idx:])

    def prepare_realtime_input(self, timeframe='1h', n_candles=30, window_size=20):
        """
        Prepare a single input sample from the most recent data for real-time inference.

        Args:
            timeframe (str): Timeframe to use
            n_candles (int): Number of recent candles to fetch
            window_size (int): Size of the sliding window

        Returns:
            tuple: (X, timestamp) where:
                X is the input features array with shape (1, window_size, n_features)
                timestamp is the timestamp of the most recent candle
        """
        # Get recent data
        df = self.get_historical_data(timeframe=timeframe, n_candles=n_candles, use_cache=False)

        if df is None or len(df) < window_size:
            logger.error(f"Not enough data for inference (need at least {window_size} candles)")
            return None, None

        # Extract features from the most recent window
        ohlcv = df[['open', 'high', 'low', 'close', 'volume']].tail(window_size).values

        # Scale the data
        if timeframe in self.scalers:
            # Use the existing scaler for this timeframe
            scaler = self.scalers[timeframe]
        else:
            # Create a new scaler and fit it on all available data
            scaler = MinMaxScaler()
            all_data = df[['open', 'high', 'low', 'close', 'volume']].values
            scaler.fit(all_data)
            self.scalers[timeframe] = scaler

        ohlcv_scaled = scaler.transform(ohlcv)

        # Reshape to (1, window_size, n_features)
        X = np.array([ohlcv_scaled])

        # Get the timestamp of the most recent candle
        timestamp = df['timestamp'].iloc[-1]

        return X, timestamp
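

# Illustrative usage sketch (not part of the original module): it exercises the
# DataInterface end to end using the built-in dummy-data fallback, so it runs
# without any exchange connectivity. Shapes and file paths depend on the
# arguments chosen here.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    di = DataInterface(symbol="BTC/USDT", timeframes=['1h', '4h'])

    # Prepare windowed features and next-candle direction targets
    X, y, timestamps = di.prepare_nn_input(n_candles=200, window_size=20)
    if X is not None:
        print(f"X shape: {X.shape}, y shape: {np.array(y).shape}")

    # Persist a training dataset (arrays, metadata, and scalers) to di.data_dir
    info = di.generate_training_dataset(n_candles=200, window_size=20)
    if info is not None:
        print(f"Saved dataset metadata to {info['metadata_path']}")

    # Build a single real-time inference sample for the 1h timeframe
    sample, ts = di.prepare_realtime_input(timeframe='1h', n_candles=30, window_size=20)
    if sample is not None:
        print(f"Real-time sample shape: {sample.shape} at {ts}")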