new nn wip
NN/utils/data_interface.py (new file, 390 lines)
@@ -0,0 +1,390 @@
"""
|
||||
Data Interface for Neural Network Trading System
|
||||
|
||||
This module provides functionality to fetch, process, and prepare data for the neural network models.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from datetime import datetime, timedelta
|
||||
import json
|
||||
import pickle
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DataInterface:
    """
    Handles data collection, processing, and preparation for neural network models.

    This class is responsible for:
    1. Fetching historical data
    2. Preprocessing data for neural network input
    3. Generating training datasets
    4. Handling real-time data integration
    """

    def __init__(self, symbol="BTC/USDT", timeframes=None, data_dir="NN/data"):
        """
        Initialize the data interface.

        Args:
            symbol (str): Trading pair symbol (e.g., "BTC/USDT")
            timeframes (list): List of timeframes to use (e.g., ['1m', '5m', '1h', '4h', '1d'])
            data_dir (str): Directory to store/load datasets
        """
        self.symbol = symbol
        self.timeframes = timeframes or ['1h', '4h', '1d']
        self.data_dir = data_dir
        self.scalers = {}  # Store scalers for each timeframe

        # Create data directory if it doesn't exist
        os.makedirs(self.data_dir, exist_ok=True)

        # Initialize empty dataframes for each timeframe
        self.dataframes = {tf: None for tf in self.timeframes}

        logger.info(f"DataInterface initialized for {symbol} with timeframes {self.timeframes}")

    def get_historical_data(self, timeframe='1h', n_candles=1000, use_cache=True):
        """
        Fetch historical price data for a given timeframe.

        Args:
            timeframe (str): Timeframe to fetch data for
            n_candles (int): Number of candles to fetch
            use_cache (bool): Whether to use cached data if available

        Returns:
            pd.DataFrame: DataFrame with OHLCV data
        """
        cache_file = os.path.join(self.data_dir, f"{self.symbol.replace('/', '_')}_{timeframe}.csv")

        # Check if cached data exists and is recent
        if use_cache and os.path.exists(cache_file):
            try:
                df = pd.read_csv(cache_file, parse_dates=['timestamp'])
                # If we have enough data, use it
                if len(df) >= n_candles:
                    logger.info(f"Using cached data for {self.symbol} {timeframe} ({len(df)} candles)")
                    self.dataframes[timeframe] = df
                    return df.tail(n_candles)
            except Exception as e:
                logger.error(f"Error reading cached data: {str(e)}")

        # If we get here, we need to fetch data.
        # For now, a placeholder stands in for fetching from an exchange.
        try:
            logger.info(f"Fetching historical data for {self.symbol} {timeframe}")

            # Placeholder for real data fetching; in a real implementation
            # this would be replaced with exchange API calls.
            self._fetch_data_from_exchange(timeframe, n_candles)

            # Save to cache
            if self.dataframes[timeframe] is not None:
                self.dataframes[timeframe].to_csv(cache_file, index=False)
                return self.dataframes[timeframe]
            else:
                # Create dummy data as fallback
                logger.warning(f"Could not fetch data for {self.symbol} {timeframe}, using dummy data")
                df = self._create_dummy_data(timeframe, n_candles)
                self.dataframes[timeframe] = df
                return df
        except Exception as e:
            logger.error(f"Error fetching data: {str(e)}")
            return None

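    # Usage sketch for get_historical_data(): with no exchange API wired up yet, the call
    # returns cached CSV data when available and otherwise falls back to generated dummy data:
    #   di = DataInterface(timeframes=['1h'])
    #   df = di.get_historical_data(timeframe='1h', n_candles=200)
    #   # df columns: timestamp, open, high, low, close, volume
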
    def _fetch_data_from_exchange(self, timeframe, n_candles):
        """
        Placeholder method for fetching data from an exchange.
        In a real implementation, this would connect to an exchange API.
        """
        # This is a placeholder - in a real implementation this would make API calls
        # to a cryptocurrency exchange to fetch OHLCV data.

        # For now, just generate dummy data
        self.dataframes[timeframe] = self._create_dummy_data(timeframe, n_candles)

    def _create_dummy_data(self, timeframe, n_candles):
        """
        Create dummy OHLCV data for testing purposes.

        Args:
            timeframe (str): Timeframe to create data for
            n_candles (int): Number of candles to create

        Returns:
            pd.DataFrame: DataFrame with dummy OHLCV data
        """
        # Map timeframe to seconds
        tf_seconds = {
            '1m': 60,
            '5m': 300,
            '15m': 900,
            '1h': 3600,
            '4h': 14400,
            '1d': 86400
        }
        seconds = tf_seconds.get(timeframe, 3600)  # Default to 1h

        # Create timestamps
        end_time = datetime.now()
        timestamps = [end_time - timedelta(seconds=seconds * i) for i in range(n_candles)]
        timestamps.reverse()  # Oldest first

        # Generate random price data with realistic patterns
        np.random.seed(42)  # For reproducibility

        # Start price
        price = 50000  # For BTC/USDT
        prices = []
        volumes = []

        for i in range(n_candles):
            # Random walk with drift and volatility based on timeframe
            drift = 0.0001 * (seconds / 3600)  # Larger drift for larger timeframes
            volatility = 0.01 * np.sqrt(seconds / 3600)  # Scale volatility by sqrt of time

            # Daily/weekly patterns
            if timeframe in ['1d', '4h']:
                # Add some cyclical patterns
                cycle = np.sin(i / 7 * np.pi) * 0.02  # Weekly cycle
            else:
                cycle = np.sin(i / 24 * np.pi) * 0.01  # Daily cycle

            # Calculate price change with random walk + cycles
            price_change = price * (drift + volatility * np.random.randn() + cycle)
            price += price_change

            # Generate OHLC from the price
            open_price = price
            high_price = price * (1 + abs(0.005 * np.random.randn()))
            low_price = price * (1 - abs(0.005 * np.random.randn()))
            close_price = price * (1 + 0.002 * np.random.randn())

            # Ensure high >= open, close and low <= open, close
            high_price = max(high_price, open_price, close_price)
            low_price = min(low_price, open_price, close_price)

            # Generate volume (higher for larger price movements)
            volume = abs(price_change) * (10000 + 5000 * np.random.rand())

            prices.append((open_price, high_price, low_price, close_price))
            volumes.append(volume)

            # Update price for next iteration
            price = close_price

        # Create DataFrame
        df = pd.DataFrame(
            [(t, o, h, l, c, v) for t, (o, h, l, c), v in zip(timestamps, prices, volumes)],
            columns=['timestamp', 'open', 'high', 'low', 'close', 'volume']
        )

        return df

    def prepare_nn_input(self, timeframes=None, n_candles=500, window_size=20):
        """
        Prepare input data for neural network models.

        Args:
            timeframes (list): List of timeframes to use
            n_candles (int): Number of candles to fetch for each timeframe
            window_size (int): Size of the sliding window for feature creation

        Returns:
            tuple: (X, y, timestamps) where:
                X is the input features array with shape (n_samples, window_size, n_features)
                y is the target array with shape (n_samples,)
                timestamps is an array of timestamps for each sample
        """
        if timeframes is None:
            timeframes = self.timeframes

        # Get data for all requested timeframes
        dfs = {}
        for tf in timeframes:
            df = self.get_historical_data(timeframe=tf, n_candles=n_candles)
            if df is not None and not df.empty:
                dfs[tf] = df

        if not dfs:
            logger.error("No data available for feature creation")
            return None, None, None

        # For simplicity, we'll use just one timeframe for now.
        # A more complete implementation would merge multiple timeframes.
        primary_tf = timeframes[0]
        if primary_tf not in dfs:
            logger.error(f"Primary timeframe {primary_tf} not available")
            return None, None, None

        df = dfs[primary_tf]

        # Create features
        X, y, timestamps = self._create_features(df, window_size)

        return X, y, timestamps

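    # Shape sketch for prepare_nn_input(): with n_candles=500 and window_size=20 on a
    # single timeframe, X comes out as (480, 20, 5) scaled OHLCV windows, y as (480,)
    # binary labels, and timestamps holds one entry per sample.
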
    def _create_features(self, df, window_size):
        """
        Create features from OHLCV data using a sliding window approach.

        Args:
            df (pd.DataFrame): DataFrame with OHLCV data
            window_size (int): Size of the sliding window

        Returns:
            tuple: (X, y, timestamps) where:
                X is the input features array
                y is the target array
                timestamps is an array of timestamps for each sample
        """
        # Extract OHLCV columns
        ohlcv = df[['open', 'high', 'low', 'close', 'volume']].values

        # Scale the data
        scaler = MinMaxScaler()
        ohlcv_scaled = scaler.fit_transform(ohlcv)

        # Store the scaler for later use
        timeframe = next((tf for tf in self.timeframes
                          if self.dataframes.get(tf) is not None and self.dataframes[tf].equals(df)),
                         'unknown')
        self.scalers[timeframe] = scaler

        # Create sliding windows
        X = []
        y = []
        timestamps = []

        for i in range(len(ohlcv_scaled) - window_size):
            # Input: window_size candles of OHLCV data
            X.append(ohlcv_scaled[i:i+window_size])

            # Target: binary classification - price goes up (1) or down (0).
            # 1 if the close price increases in the next candle, 0 otherwise.
            price_change = ohlcv[i+window_size, 3] - ohlcv[i+window_size-1, 3]
            y.append(1 if price_change > 0 else 0)

            # Store timestamp for reference
            timestamps.append(df['timestamp'].iloc[i+window_size])

        return np.array(X), np.array(y), np.array(timestamps)

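    # Worked example of the labeling rule above: with window_size=20, the window covering
    # candles i..i+19 is labeled by comparing close[i+20] with close[i+19], so a frame of
    # 500 candles yields 500 - 20 = 480 (window, label) pairs.
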
    def generate_training_dataset(self, timeframes=None, n_candles=1000, window_size=20):
        """
        Generate and save a training dataset for neural network models.

        Args:
            timeframes (list): List of timeframes to use
            n_candles (int): Number of candles to fetch for each timeframe
            window_size (int): Size of the sliding window for feature creation

        Returns:
            dict: Dictionary of dataset file paths
        """
        if timeframes is None:
            timeframes = self.timeframes

        # Prepare inputs
        X, y, timestamps = self.prepare_nn_input(timeframes, n_candles, window_size)

        if X is None or y is None:
            logger.error("Failed to prepare input data for dataset")
            return None

        # Prepare output paths
        timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        dataset_name = f"{self.symbol.replace('/', '_')}_{'_'.join(timeframes)}_{timestamp_str}"

        X_path = os.path.join(self.data_dir, f"{dataset_name}_X.npy")
        y_path = os.path.join(self.data_dir, f"{dataset_name}_y.npy")
        timestamps_path = os.path.join(self.data_dir, f"{dataset_name}_timestamps.npy")
        metadata_path = os.path.join(self.data_dir, f"{dataset_name}_metadata.json")

        # Save arrays
        np.save(X_path, X)
        np.save(y_path, y)
        np.save(timestamps_path, timestamps)

        # Save metadata
        metadata = {
            'symbol': self.symbol,
            'timeframes': timeframes,
            'window_size': window_size,
            'n_samples': len(X),
            'feature_shape': X.shape[1:],
            'created_at': datetime.now().isoformat(),
            'dataset_name': dataset_name
        }

        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        # Save scalers
        scaler_path = os.path.join(self.data_dir, f"{dataset_name}_scalers.pkl")
        with open(scaler_path, 'wb') as f:
            pickle.dump(self.scalers, f)

        # Return dataset info
        dataset_info = {
            'X_path': X_path,
            'y_path': y_path,
            'timestamps_path': timestamps_path,
            'metadata_path': metadata_path,
            'scaler_path': scaler_path
        }

        logger.info(f"Dataset generated and saved: {dataset_name}")
        return dataset_info

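    # Loading a generated dataset back (sketch; all paths come from the returned dict):
    #   info = di.generate_training_dataset(timeframes=['1h'], n_candles=1000)
    #   X = np.load(info['X_path'])
    #   y = np.load(info['y_path'])
    #   ts = np.load(info['timestamps_path'], allow_pickle=True)  # object array of timestamps
    #   with open(info['scaler_path'], 'rb') as f:
    #       scalers = pickle.load(f)
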
    def prepare_realtime_input(self, timeframe='1h', n_candles=30, window_size=20):
        """
        Prepare a single input sample from the most recent data for real-time inference.

        Args:
            timeframe (str): Timeframe to use
            n_candles (int): Number of recent candles to fetch
            window_size (int): Size of the sliding window

        Returns:
            tuple: (X, timestamp) where:
                X is the input features array with shape (1, window_size, n_features)
                timestamp is the timestamp of the most recent candle
        """
        # Get recent data
        df = self.get_historical_data(timeframe=timeframe, n_candles=n_candles, use_cache=False)

        if df is None or len(df) < window_size:
            logger.error(f"Not enough data for inference (need at least {window_size} candles)")
            return None, None

        # Extract features from the most recent window
        ohlcv = df[['open', 'high', 'low', 'close', 'volume']].tail(window_size).values

        # Scale the data
        if timeframe in self.scalers:
            # Use existing scaler
            scaler = self.scalers[timeframe]
        else:
            # Create a new scaler and fit it on all available data
            scaler = MinMaxScaler()
            all_data = df[['open', 'high', 'low', 'close', 'volume']].values
            scaler.fit(all_data)
            self.scalers[timeframe] = scaler

        ohlcv_scaled = scaler.transform(ohlcv)

        # Reshape to (1, window_size, n_features)
        X = np.array([ohlcv_scaled])

        # Get timestamp of the most recent candle
        timestamp = df['timestamp'].iloc[-1]

        return X, timestamp

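
# Minimal usage sketch. Since _fetch_data_from_exchange() is still a placeholder,
# all data below comes from the dummy-data generator, so this runs standalone.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    di = DataInterface(symbol="BTC/USDT", timeframes=['1h', '4h'])

    # Build training inputs on the primary timeframe.
    X, y, timestamps = di.prepare_nn_input(timeframes=['1h'], n_candles=500, window_size=20)
    if X is not None:
        print(f"Training input: X={X.shape}, y={y.shape}")  # e.g. (480, 20, 5) and (480,)

    # Prepare a single sample for real-time inference.
    X_live, ts = di.prepare_realtime_input(timeframe='1h', n_candles=30, window_size=20)
    if X_live is not None:
        print(f"Realtime input: {X_live.shape} at {ts}")  # (1, 20, 5)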