new nn wip
NN/utils/data_interface.py (new file, 390 lines)
@@ -0,0 +1,390 @@
"""
|
||||
Data Interface for Neural Network Trading System
|
||||
|
||||
This module provides functionality to fetch, process, and prepare data for the neural network models.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from datetime import datetime, timedelta
|
||||
import json
|
||||
import pickle
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DataInterface:
    """
    Handles data collection, processing, and preparation for neural network models.

    This class is responsible for:
    1. Fetching historical data
    2. Preprocessing data for neural network input
    3. Generating training datasets
    4. Handling real-time data integration
    """

    def __init__(self, symbol="BTC/USDT", timeframes=None, data_dir="NN/data"):
        """
        Initialize the data interface.

        Args:
            symbol (str): Trading pair symbol (e.g., "BTC/USDT")
            timeframes (list): List of timeframes to use (e.g., ['1m', '5m', '1h', '4h', '1d'])
            data_dir (str): Directory to store/load datasets
        """
        self.symbol = symbol
        self.timeframes = timeframes or ['1h', '4h', '1d']
        self.data_dir = data_dir
        self.scalers = {}  # Store scalers for each timeframe

        # Create data directory if it doesn't exist
        os.makedirs(self.data_dir, exist_ok=True)

        # Initialize empty dataframes for each timeframe
        self.dataframes = {tf: None for tf in self.timeframes}

        logger.info(f"DataInterface initialized for {symbol} with timeframes {self.timeframes}")

    def get_historical_data(self, timeframe='1h', n_candles=1000, use_cache=True):
        """
        Fetch historical price data for a given timeframe.

        Args:
            timeframe (str): Timeframe to fetch data for
            n_candles (int): Number of candles to fetch
            use_cache (bool): Whether to use cached data if available

        Returns:
            pd.DataFrame: DataFrame with OHLCV data
        """
        cache_file = os.path.join(self.data_dir, f"{self.symbol.replace('/', '_')}_{timeframe}.csv")

        # Check if cached data exists and is recent
        if use_cache and os.path.exists(cache_file):
            try:
                df = pd.read_csv(cache_file, parse_dates=['timestamp'])
                # If we have enough data, use it
                if len(df) >= n_candles:
                    logger.info(f"Using cached data for {self.symbol} {timeframe} ({len(df)} candles)")
                    self.dataframes[timeframe] = df
                    return df.tail(n_candles)
            except Exception as e:
                logger.error(f"Error reading cached data: {str(e)}")

        # If we get here, we need to fetch data.
        # For now, a placeholder stands in for fetching from an exchange.
        try:
            logger.info(f"Fetching historical data for {self.symbol} {timeframe}")

            # Placeholder for real data fetching; in a real implementation
            # this would be replaced with exchange API calls.
            self._fetch_data_from_exchange(timeframe, n_candles)

            # Save to cache
            if self.dataframes[timeframe] is not None:
                self.dataframes[timeframe].to_csv(cache_file, index=False)
                return self.dataframes[timeframe]
            else:
                # Create dummy data as fallback
                logger.warning(f"Could not fetch data for {self.symbol} {timeframe}, using dummy data")
                df = self._create_dummy_data(timeframe, n_candles)
                self.dataframes[timeframe] = df
                return df
        except Exception as e:
            logger.error(f"Error fetching data: {str(e)}")
            return None

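    # Usage sketch for get_historical_data(): with no exchange API wired up yet, the call
    # returns cached CSV data when available and otherwise falls back to generated dummy data:
    #   di = DataInterface(timeframes=['1h'])
    #   df = di.get_historical_data(timeframe='1h', n_candles=200)
    #   # df columns: timestamp, open, high, low, close, volume
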
    def _fetch_data_from_exchange(self, timeframe, n_candles):
        """
        Placeholder method for fetching data from an exchange.
        In a real implementation, this would connect to an exchange API.
        """
        # This is a placeholder - in a real implementation this would make API calls
        # to a cryptocurrency exchange to fetch OHLCV data.

        # For now, just generate dummy data
        self.dataframes[timeframe] = self._create_dummy_data(timeframe, n_candles)

    def _create_dummy_data(self, timeframe, n_candles):
        """
        Create dummy OHLCV data for testing purposes.

        Args:
            timeframe (str): Timeframe to create data for
            n_candles (int): Number of candles to create

        Returns:
            pd.DataFrame: DataFrame with dummy OHLCV data
        """
        # Map timeframe to seconds
        tf_seconds = {
            '1m': 60,
            '5m': 300,
            '15m': 900,
            '1h': 3600,
            '4h': 14400,
            '1d': 86400
        }
        seconds = tf_seconds.get(timeframe, 3600)  # Default to 1h

        # Create timestamps
        end_time = datetime.now()
        timestamps = [end_time - timedelta(seconds=seconds * i) for i in range(n_candles)]
        timestamps.reverse()  # Oldest first

        # Generate random price data with realistic patterns
        np.random.seed(42)  # For reproducibility

        # Start price
        price = 50000  # For BTC/USDT
        prices = []
        volumes = []

        for i in range(n_candles):
            # Random walk with drift and volatility based on timeframe
            drift = 0.0001 * (seconds / 3600)  # Larger drift for larger timeframes
            volatility = 0.01 * np.sqrt(seconds / 3600)  # Scale volatility by sqrt of time

            # Daily/weekly patterns
            if timeframe in ['1d', '4h']:
                # Add some cyclical patterns
                cycle = np.sin(i / 7 * np.pi) * 0.02  # Weekly cycle
            else:
                cycle = np.sin(i / 24 * np.pi) * 0.01  # Daily cycle

            # Calculate price change with random walk + cycles
            price_change = price * (drift + volatility * np.random.randn() + cycle)
            price += price_change

            # Generate OHLC from the price
            open_price = price
            high_price = price * (1 + abs(0.005 * np.random.randn()))
            low_price = price * (1 - abs(0.005 * np.random.randn()))
            close_price = price * (1 + 0.002 * np.random.randn())

            # Ensure high >= open, close and low <= open, close
            high_price = max(high_price, open_price, close_price)
            low_price = min(low_price, open_price, close_price)

            # Generate volume (higher for larger price movements)
            volume = abs(price_change) * (10000 + 5000 * np.random.rand())

            prices.append((open_price, high_price, low_price, close_price))
            volumes.append(volume)

            # Update price for next iteration
            price = close_price

        # Create DataFrame
        df = pd.DataFrame(
            [(t, o, h, l, c, v) for t, (o, h, l, c), v in zip(timestamps, prices, volumes)],
            columns=['timestamp', 'open', 'high', 'low', 'close', 'volume']
        )

        return df

    def prepare_nn_input(self, timeframes=None, n_candles=500, window_size=20):
        """
        Prepare input data for neural network models.

        Args:
            timeframes (list): List of timeframes to use
            n_candles (int): Number of candles to fetch for each timeframe
            window_size (int): Size of the sliding window for feature creation

        Returns:
            tuple: (X, y, timestamps) where:
                X is the input features array with shape (n_samples, window_size, n_features)
                y is the target array with shape (n_samples,)
                timestamps is an array of timestamps for each sample
        """
        if timeframes is None:
            timeframes = self.timeframes

        # Get data for all requested timeframes
        dfs = {}
        for tf in timeframes:
            df = self.get_historical_data(timeframe=tf, n_candles=n_candles)
            if df is not None and not df.empty:
                dfs[tf] = df

        if not dfs:
            logger.error("No data available for feature creation")
            return None, None, None

        # For simplicity, we'll use just one timeframe for now.
        # A more complete implementation would merge multiple timeframes.
        primary_tf = timeframes[0]
        if primary_tf not in dfs:
            logger.error(f"Primary timeframe {primary_tf} not available")
            return None, None, None

        df = dfs[primary_tf]

        # Create features
        X, y, timestamps = self._create_features(df, window_size)

        return X, y, timestamps

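    # Shape sketch for prepare_nn_input(): with n_candles=500 and window_size=20 on a
    # single timeframe, X comes out as (480, 20, 5) scaled OHLCV windows, y as (480,)
    # binary labels, and timestamps holds one entry per sample.
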
    def _create_features(self, df, window_size):
        """
        Create features from OHLCV data using a sliding window approach.

        Args:
            df (pd.DataFrame): DataFrame with OHLCV data
            window_size (int): Size of the sliding window

        Returns:
            tuple: (X, y, timestamps) where:
                X is the input features array
                y is the target array
                timestamps is an array of timestamps for each sample
        """
        # Extract OHLCV columns
        ohlcv = df[['open', 'high', 'low', 'close', 'volume']].values

        # Scale the data
        scaler = MinMaxScaler()
        ohlcv_scaled = scaler.fit_transform(ohlcv)

        # Store the scaler for later use
        timeframe = next((tf for tf in self.timeframes
                          if self.dataframes.get(tf) is not None and self.dataframes[tf].equals(df)),
                         'unknown')
        self.scalers[timeframe] = scaler

        # Create sliding windows
        X = []
        y = []
        timestamps = []

        for i in range(len(ohlcv_scaled) - window_size):
            # Input: window_size candles of OHLCV data
            X.append(ohlcv_scaled[i:i+window_size])

            # Target: binary classification - price goes up (1) or down (0).
            # 1 if the close price increases in the next candle, 0 otherwise.
            price_change = ohlcv[i+window_size, 3] - ohlcv[i+window_size-1, 3]
            y.append(1 if price_change > 0 else 0)

            # Store timestamp for reference
            timestamps.append(df['timestamp'].iloc[i+window_size])

        return np.array(X), np.array(y), np.array(timestamps)

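    # Worked example of the labeling rule above: with window_size=20, the window covering
    # candles i..i+19 is labeled by comparing close[i+20] with close[i+19], so a frame of
    # 500 candles yields 500 - 20 = 480 (window, label) pairs.
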
    def generate_training_dataset(self, timeframes=None, n_candles=1000, window_size=20):
        """
        Generate and save a training dataset for neural network models.

        Args:
            timeframes (list): List of timeframes to use
            n_candles (int): Number of candles to fetch for each timeframe
            window_size (int): Size of the sliding window for feature creation

        Returns:
            dict: Dictionary of dataset file paths
        """
        if timeframes is None:
            timeframes = self.timeframes

        # Prepare inputs
        X, y, timestamps = self.prepare_nn_input(timeframes, n_candles, window_size)

        if X is None or y is None:
            logger.error("Failed to prepare input data for dataset")
            return None

        # Prepare output paths
        timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        dataset_name = f"{self.symbol.replace('/', '_')}_{'_'.join(timeframes)}_{timestamp_str}"

        X_path = os.path.join(self.data_dir, f"{dataset_name}_X.npy")
        y_path = os.path.join(self.data_dir, f"{dataset_name}_y.npy")
        timestamps_path = os.path.join(self.data_dir, f"{dataset_name}_timestamps.npy")
        metadata_path = os.path.join(self.data_dir, f"{dataset_name}_metadata.json")

        # Save arrays
        np.save(X_path, X)
        np.save(y_path, y)
        np.save(timestamps_path, timestamps)

        # Save metadata
        metadata = {
            'symbol': self.symbol,
            'timeframes': timeframes,
            'window_size': window_size,
            'n_samples': len(X),
            'feature_shape': X.shape[1:],
            'created_at': datetime.now().isoformat(),
            'dataset_name': dataset_name
        }

        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        # Save scalers
        scaler_path = os.path.join(self.data_dir, f"{dataset_name}_scalers.pkl")
        with open(scaler_path, 'wb') as f:
            pickle.dump(self.scalers, f)

        # Return dataset info
        dataset_info = {
            'X_path': X_path,
            'y_path': y_path,
            'timestamps_path': timestamps_path,
            'metadata_path': metadata_path,
            'scaler_path': scaler_path
        }

        logger.info(f"Dataset generated and saved: {dataset_name}")
        return dataset_info

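    # Loading a generated dataset back (sketch; all paths come from the returned dict):
    #   info = di.generate_training_dataset(timeframes=['1h'], n_candles=1000)
    #   X = np.load(info['X_path'])
    #   y = np.load(info['y_path'])
    #   ts = np.load(info['timestamps_path'], allow_pickle=True)  # object array of timestamps
    #   with open(info['scaler_path'], 'rb') as f:
    #       scalers = pickle.load(f)
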
    def prepare_realtime_input(self, timeframe='1h', n_candles=30, window_size=20):
        """
        Prepare a single input sample from the most recent data for real-time inference.

        Args:
            timeframe (str): Timeframe to use
            n_candles (int): Number of recent candles to fetch
            window_size (int): Size of the sliding window

        Returns:
            tuple: (X, timestamp) where:
                X is the input features array with shape (1, window_size, n_features)
                timestamp is the timestamp of the most recent candle
        """
        # Get recent data
        df = self.get_historical_data(timeframe=timeframe, n_candles=n_candles, use_cache=False)

        if df is None or len(df) < window_size:
            logger.error(f"Not enough data for inference (need at least {window_size} candles)")
            return None, None

        # Extract features from the most recent window
        ohlcv = df[['open', 'high', 'low', 'close', 'volume']].tail(window_size).values

        # Scale the data
        if timeframe in self.scalers:
            # Use existing scaler
            scaler = self.scalers[timeframe]
        else:
            # Create a new scaler and fit it on all available data
            scaler = MinMaxScaler()
            all_data = df[['open', 'high', 'low', 'close', 'volume']].values
            scaler.fit(all_data)
            self.scalers[timeframe] = scaler

        ohlcv_scaled = scaler.transform(ohlcv)

        # Reshape to (1, window_size, n_features)
        X = np.array([ohlcv_scaled])

        # Get timestamp of the most recent candle
        timestamp = df['timestamp'].iloc[-1]

        return X, timestamp

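
# Minimal usage sketch. Since _fetch_data_from_exchange() is still a placeholder,
# all data below comes from the dummy-data generator, so this runs standalone.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    di = DataInterface(symbol="BTC/USDT", timeframes=['1h', '4h'])

    # Build training inputs on the primary timeframe.
    X, y, timestamps = di.prepare_nn_input(timeframes=['1h'], n_candles=500, window_size=20)
    if X is not None:
        print(f"Training input: X={X.shape}, y={y.shape}")  # e.g. (480, 20, 5) and (480,)

    # Prepare a single sample for real-time inference.
    X_live, ts = di.prepare_realtime_input(timeframe='1h', n_candles=30, window_size=20)
    if X_live is not None:
        print(f"Realtime input: {X_live.shape} at {ts}")  # (1, 20, 5)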