""" Data Interface for Neural Network Trading System This module provides functionality to fetch, process, and prepare data for the neural network models. """ import os import logging import numpy as np import pandas as pd from datetime import datetime, timedelta import json import pickle from sklearn.preprocessing import MinMaxScaler logger = logging.getLogger(__name__) class DataInterface: """ Handles data collection, processing, and preparation for neural network models. This class is responsible for: 1. Fetching historical data 2. Preprocessing data for neural network input 3. Generating training datasets 4. Handling real-time data integration """ def __init__(self, symbol="BTC/USDT", timeframes=None, data_dir="NN/data"): """ Initialize the data interface. Args: symbol (str): Trading pair symbol (e.g., "BTC/USDT") timeframes (list): List of timeframes to use (e.g., ['1m', '5m', '1h', '4h', '1d']) data_dir (str): Directory to store/load datasets """ self.symbol = symbol self.timeframes = timeframes or ['1h', '4h', '1d'] self.data_dir = data_dir self.scalers = {} # Store scalers for each timeframe # Create data directory if it doesn't exist os.makedirs(self.data_dir, exist_ok=True) # Initialize empty dataframes for each timeframe self.dataframes = {tf: None for tf in self.timeframes} logger.info(f"DataInterface initialized for {symbol} with timeframes {timeframes}") def get_historical_data(self, timeframe='1h', n_candles=1000, use_cache=True): """ Fetch historical price data for a given timeframe. Args: timeframe (str): Timeframe to fetch data for n_candles (int): Number of candles to fetch use_cache (bool): Whether to use cached data if available Returns: pd.DataFrame: DataFrame with OHLCV data """ cache_file = os.path.join(self.data_dir, f"{self.symbol.replace('/', '_')}_{timeframe}.csv") # Check if cached data exists and is recent if use_cache and os.path.exists(cache_file): try: df = pd.read_csv(cache_file, parse_dates=['timestamp']) # If we have enough data and it's recent, use it if len(df) >= n_candles: logger.info(f"Using cached data for {self.symbol} {timeframe} ({len(df)} candles)") self.dataframes[timeframe] = df return df.tail(n_candles) except Exception as e: logger.error(f"Error reading cached data: {str(e)}") # If we get here, we need to fetch data # For now, we'll use a placeholder for fetching data from an exchange try: # In a real implementation, we would fetch data from an exchange or API here # For this example, we'll create dummy data if we can't load from cache logger.info(f"Fetching historical data for {self.symbol} {timeframe}") # Placeholder for real data fetching # In a real implementation, this would be replaced with API calls self._fetch_data_from_exchange(timeframe, n_candles) # Save to cache if self.dataframes[timeframe] is not None: self.dataframes[timeframe].to_csv(cache_file, index=False) return self.dataframes[timeframe] else: # Create dummy data as fallback logger.warning(f"Could not fetch data for {self.symbol} {timeframe}, using dummy data") df = self._create_dummy_data(timeframe, n_candles) self.dataframes[timeframe] = df return df except Exception as e: logger.error(f"Error fetching data: {str(e)}") return None def _fetch_data_from_exchange(self, timeframe, n_candles): """ Placeholder method for fetching data from an exchange. In a real implementation, this would connect to an exchange API. 
""" # This is a placeholder - in a real implementation this would make API calls # to a cryptocurrency exchange to fetch OHLCV data # For now, just generate dummy data self.dataframes[timeframe] = self._create_dummy_data(timeframe, n_candles) def _create_dummy_data(self, timeframe, n_candles): """ Create dummy OHLCV data for testing purposes. Args: timeframe (str): Timeframe to create data for n_candles (int): Number of candles to create Returns: pd.DataFrame: DataFrame with dummy OHLCV data """ # Map timeframe to seconds tf_seconds = { '1m': 60, '5m': 300, '15m': 900, '1h': 3600, '4h': 14400, '1d': 86400 } seconds = tf_seconds.get(timeframe, 3600) # Default to 1h # Create timestamps end_time = datetime.now() timestamps = [end_time - timedelta(seconds=seconds * i) for i in range(n_candles)] timestamps.reverse() # Oldest first # Generate random price data with realistic patterns np.random.seed(42) # For reproducibility # Start price price = 50000 # For BTC/USDT prices = [] volumes = [] for i in range(n_candles): # Random walk with drift and volatility based on timeframe drift = 0.0001 * seconds # Larger drift for larger timeframes volatility = 0.01 * np.sqrt(seconds / 3600) # Scale volatility by sqrt of time # Daily/weekly patterns if timeframe in ['1d', '4h']: # Add some cyclical patterns cycle = np.sin(i / 7 * np.pi) * 0.02 # Weekly cycle else: cycle = np.sin(i / 24 * np.pi) * 0.01 # Daily cycle # Calculate price change with random walk + cycles price_change = price * (drift + volatility * np.random.randn() + cycle) price += price_change # Generate OHLC from the price open_price = price high_price = price * (1 + abs(0.005 * np.random.randn())) low_price = price * (1 - abs(0.005 * np.random.randn())) close_price = price * (1 + 0.002 * np.random.randn()) # Ensure high >= open, close, low and low <= open, close high_price = max(high_price, open_price, close_price) low_price = min(low_price, open_price, close_price) # Generate volume (higher for larger price movements) volume = abs(price_change) * (10000 + 5000 * np.random.rand()) prices.append((open_price, high_price, low_price, close_price)) volumes.append(volume) # Update price for next iteration price = close_price # Create DataFrame df = pd.DataFrame( [(t, o, h, l, c, v) for t, (o, h, l, c), v in zip(timestamps, prices, volumes)], columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'] ) return df def prepare_nn_input(self, timeframes=None, n_candles=500, window_size=20): """ Prepare input data for neural network models. 
        """
        Prepare input data for neural network models.

        Args:
            timeframes (list): List of timeframes to use
            n_candles (int): Number of candles to fetch for each timeframe
            window_size (int): Size of the sliding window for feature creation

        Returns:
            tuple: (X, y, timestamps) where:
                X is the input features array with shape (n_samples, window_size, n_features)
                y is the target array with shape (n_samples,)
                timestamps is an array of timestamps for each sample
        """
        if timeframes is None:
            timeframes = self.timeframes

        # Get data for all requested timeframes
        dfs = {}
        for tf in timeframes:
            df = self.get_historical_data(timeframe=tf, n_candles=n_candles)
            if df is not None and not df.empty:
                dfs[tf] = df

        if not dfs:
            logger.error("No data available for feature creation")
            return None, None, None

        # For simplicity, we use just one timeframe for now.
        # A more complex implementation would merge multiple timeframes.
        primary_tf = timeframes[0]
        if primary_tf not in dfs:
            logger.error(f"Primary timeframe {primary_tf} not available")
            return None, None, None

        df = dfs[primary_tf]

        # Create features, keying the fitted scaler to the primary timeframe
        X, y, timestamps = self._create_features(df, window_size, timeframe=primary_tf)

        return X, y, timestamps

    def _create_features(self, df, window_size, timeframe='unknown'):
        """
        Create features from OHLCV data using a sliding window approach.

        Args:
            df (pd.DataFrame): DataFrame with OHLCV data
            window_size (int): Size of the sliding window
            timeframe (str): Timeframe key under which to store the fitted scaler

        Returns:
            tuple: (X, y, timestamps) where:
                X is the input features array
                y is the target array
                timestamps is an array of timestamps for each sample
        """
        # Extract OHLCV columns
        ohlcv = df[['open', 'high', 'low', 'close', 'volume']].values

        # Scale the data
        scaler = MinMaxScaler()
        ohlcv_scaled = scaler.fit_transform(ohlcv)

        # Store the scaler for later use
        self.scalers[timeframe] = scaler

        # Create sliding windows
        X = []
        y = []
        timestamps = []

        for i in range(len(ohlcv_scaled) - window_size):
            # Input: window_size candles of OHLCV data
            X.append(ohlcv_scaled[i:i + window_size])

            # Target: binary classification - price goes up (1) or down (0);
            # 1 if the close price increases in the next candle, 0 otherwise
            price_change = ohlcv[i + window_size, 3] - ohlcv[i + window_size - 1, 3]
            y.append(1 if price_change > 0 else 0)

            # Store timestamp for reference
            timestamps.append(df['timestamp'].iloc[i + window_size])

        return np.array(X), np.array(y), np.array(timestamps)

    def generate_training_dataset(self, timeframes=None, n_candles=1000, window_size=20):
        """
        Generate and save a training dataset for neural network models.

        Args:
            timeframes (list): List of timeframes to use
            n_candles (int): Number of candles to fetch for each timeframe
            window_size (int): Size of the sliding window for feature creation

        Returns:
            dict: Dictionary of dataset file paths
        """
        if timeframes is None:
            timeframes = self.timeframes

        # Prepare inputs
        X, y, timestamps = self.prepare_nn_input(timeframes, n_candles, window_size)

        if X is None or y is None:
            logger.error("Failed to prepare input data for dataset")
            return None

        # Prepare output paths
        timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        dataset_name = f"{self.symbol.replace('/', '_')}_{'_'.join(timeframes)}_{timestamp_str}"

        X_path = os.path.join(self.data_dir, f"{dataset_name}_X.npy")
        y_path = os.path.join(self.data_dir, f"{dataset_name}_y.npy")
        timestamps_path = os.path.join(self.data_dir, f"{dataset_name}_timestamps.npy")
        metadata_path = os.path.join(self.data_dir, f"{dataset_name}_metadata.json")

        # Save arrays
        np.save(X_path, X)
        np.save(y_path, y)
        np.save(timestamps_path, timestamps)

        # Save metadata
        metadata = {
            'symbol': self.symbol,
            'timeframes': timeframes,
            'window_size': window_size,
            'n_samples': len(X),
            'feature_shape': X.shape[1:],
            'created_at': datetime.now().isoformat(),
            'dataset_name': dataset_name
        }

        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        # Save scalers
        scaler_path = os.path.join(self.data_dir, f"{dataset_name}_scalers.pkl")
        with open(scaler_path, 'wb') as f:
            pickle.dump(self.scalers, f)

        # Return dataset info
        dataset_info = {
            'X_path': X_path,
            'y_path': y_path,
            'timestamps_path': timestamps_path,
            'metadata_path': metadata_path,
            'scaler_path': scaler_path
        }

        logger.info(f"Dataset generated and saved: {dataset_name}")

        return dataset_info

    def prepare_realtime_input(self, timeframe='1h', n_candles=30, window_size=20):
        """
        Prepare a single input sample from the most recent data for real-time inference.

        Args:
            timeframe (str): Timeframe to use
            n_candles (int): Number of recent candles to fetch
            window_size (int): Size of the sliding window

        Returns:
            tuple: (X, timestamp) where:
                X is the input features array with shape (1, window_size, n_features)
                timestamp is the timestamp of the most recent candle
        """
        # Get recent data
        df = self.get_historical_data(timeframe=timeframe, n_candles=n_candles, use_cache=False)

        if df is None or len(df) < window_size:
            logger.error(f"Not enough data for inference (need at least {window_size} candles)")
            return None, None

        # Extract features from the most recent window
        ohlcv = df[['open', 'high', 'low', 'close', 'volume']].tail(window_size).values

        # Scale the data
        if timeframe in self.scalers:
            # Use the existing scaler
            scaler = self.scalers[timeframe]
        else:
            # Create a new scaler and fit it on all available data
            scaler = MinMaxScaler()
            all_data = df[['open', 'high', 'low', 'close', 'volume']].values
            scaler.fit(all_data)
            self.scalers[timeframe] = scaler

        ohlcv_scaled = scaler.transform(ohlcv)

        # Reshape to (1, window_size, n_features)
        X = np.array([ohlcv_scaled])

        # Get timestamp of the most recent candle
        timestamp = df['timestamp'].iloc[-1]

        return X, timestamp
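

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): it exercises the class end to end
    # using the dummy-data fallback, since no exchange connectivity is wired in here.
    # The symbol, timeframes, and candle counts below are example choices, not
    # values required by the module.
    logging.basicConfig(level=logging.INFO)

    di = DataInterface(symbol="BTC/USDT", timeframes=['1h', '4h'], data_dir="NN/data")

    # Generate and persist a training dataset built from the primary timeframe
    dataset_info = di.generate_training_dataset(n_candles=500, window_size=20)
    if dataset_info:
        print(f"Dataset files: {dataset_info}")

    # Prepare a single sample for real-time inference
    X, ts = di.prepare_realtime_input(timeframe='1h', n_candles=60, window_size=20)
    if X is not None:
        print(f"Real-time input shape: {X.shape}, latest candle at {ts}")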