gogo2/data_stream_monitor.py
#!/usr/bin/env python3
"""
Data Stream Monitor for Model Input Capture and Replay
Captures and streams all model input data in console-friendly text format.
Suitable for snapshots, training, and replay functionality.
"""
import logging
import json
import time
from datetime import datetime
from typing import Dict, List, Any, Optional
from collections import deque
import threading
import os
# Set up separate logger for data stream monitor
stream_logger = logging.getLogger('data_stream_monitor')
stream_logger.setLevel(logging.INFO)
# Create file handler for data stream logs
stream_log_file = os.path.join('logs', 'data_stream_monitor.log')
os.makedirs(os.path.dirname(stream_log_file), exist_ok=True)
stream_handler = logging.FileHandler(stream_log_file)
stream_handler.setLevel(logging.INFO)
# Create formatter
stream_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(stream_formatter)
# Add handler to logger (only if not already added)
if not stream_logger.handlers:
    stream_logger.addHandler(stream_handler)
# Prevent propagation to root logger to avoid duplicate logs
stream_logger.propagate = False
logger = logging.getLogger(__name__)
class DataStreamMonitor:
"""Monitors and streams all model input data for training and replay"""
def __init__(self, orchestrator=None, data_provider=None, training_system=None):
self.orchestrator = orchestrator
self.data_provider = data_provider
self.training_system = training_system
# Data buffers for streaming (expanded for accessing historical data)
self.data_streams = {
'ohlcv_1s': deque(maxlen=300), # 300 seconds for 1s data
'ohlcv_1m': deque(maxlen=300), # 300 minutes for 1m data (ETH)
'ohlcv_1h': deque(maxlen=300), # 300 hours for 1h data (ETH)
'ohlcv_1d': deque(maxlen=300), # 300 days for 1d data (ETH)
'btc_1m': deque(maxlen=300), # 300 minutes for BTC 1m data
'ohlcv_5m': deque(maxlen=100), # Keep for compatibility
'ohlcv_15m': deque(maxlen=100), # Keep for compatibility
'ticks': deque(maxlen=200),
'cob_raw': deque(maxlen=100),
'cob_aggregated': deque(maxlen=50),
'technical_indicators': deque(maxlen=100),
'model_states': deque(maxlen=50),
'predictions': deque(maxlen=100),
'training_experiences': deque(maxlen=200)
}
# Streaming configuration - expanded for model requirements
self.stream_config = {
'console_output': True,
'compact_format': False,
'include_timestamps': True,
'filter_symbols': ['ETH/USDT', 'BTC/USDT'], # Primary and secondary symbols
'primary_symbol': 'ETH/USDT',
'secondary_symbol': 'BTC/USDT',
'timeframes': ['1s', '1m', '1h', '1d'], # Required timeframes for models
'sampling_rate': 1.0 # seconds between samples
}
self.is_streaming = False
self.stream_thread = None
self.last_sample_time = 0
logger.info("DataStreamMonitor initialized")
    def start_streaming(self):
        """Start the data streaming thread"""
        if self.is_streaming:
            logger.warning("Data streaming already active")
            return

        self.is_streaming = True
        self.stream_thread = threading.Thread(target=self._streaming_worker, daemon=True)
        self.stream_thread.start()
        logger.info("Data streaming started")

    def stop_streaming(self):
        """Stop the data streaming"""
        self.is_streaming = False
        if self.stream_thread:
            self.stream_thread.join(timeout=2)
        logger.info("Data streaming stopped")
    def _streaming_worker(self):
        """Main streaming worker that collects and outputs data"""
        while self.is_streaming:
            try:
                current_time = time.time()
                if current_time - self.last_sample_time >= self.stream_config['sampling_rate']:
                    self._collect_data_sample()
                    self._output_data_sample()
                    self.last_sample_time = current_time
                time.sleep(0.5)  # Check every 500ms
            except Exception as e:
                logger.error(f"Error in streaming worker: {e}")
                time.sleep(2)
    def _collect_data_sample(self):
        """Collect one sample of all data streams"""
        try:
            timestamp = datetime.now()

            # 1. OHLCV Data Collection
            self._collect_ohlcv_data(timestamp)

            # 2. Tick Data Collection
            self._collect_tick_data(timestamp)

            # 3. COB Data Collection
            self._collect_cob_data(timestamp)

            # 4. Technical Indicators
            self._collect_technical_indicators(timestamp)

            # 5. Model States
            self._collect_model_states(timestamp)

            # 6. Predictions
            self._collect_predictions(timestamp)

            # 7. Training Experiences
            self._collect_training_experiences(timestamp)
        except Exception as e:
            logger.error(f"Error collecting data sample: {e}")
    def _collect_ohlcv_data(self, timestamp: datetime):
        """Collect OHLCV data for all timeframes and symbols"""
        try:
            # ETH/USDT data for all required timeframes
            primary_symbol = self.stream_config['primary_symbol']
            for timeframe in ['1m', '1h', '1d']:
                if self.data_provider:
                    # Fetch up to 300 bars so the latest bar is available and an empty stream can be backfilled
                    df = self.data_provider.get_historical_data(primary_symbol, timeframe, limit=300)
                    if df is not None and not df.empty:
                        # Get the latest bar
                        latest_bar = {
                            'timestamp': timestamp.isoformat(),
                            'symbol': primary_symbol,
                            'timeframe': timeframe,
                            'open': float(df['open'].iloc[-1]),
                            'high': float(df['high'].iloc[-1]),
                            'low': float(df['low'].iloc[-1]),
                            'close': float(df['close'].iloc[-1]),
                            'volume': float(df['volume'].iloc[-1])
                        }
                        stream_key = f'ohlcv_{timeframe}'

                        # Only add if different from last entry or if stream is empty
                        if len(self.data_streams[stream_key]) == 0 or \
                           self.data_streams[stream_key][-1]['close'] != latest_bar['close']:
                            self.data_streams[stream_key].append(latest_bar)

                            # If stream was empty, populate with historical data
                            if len(self.data_streams[stream_key]) == 1:
                                logger.info(f"Populating {stream_key} with historical data...")
                                self._populate_historical_data(df, stream_key, primary_symbol, timeframe)

            # BTC/USDT 1m data (secondary symbol)
            secondary_symbol = self.stream_config['secondary_symbol']
            if self.data_provider:
                df = self.data_provider.get_historical_data(secondary_symbol, '1m', limit=300)
                if df is not None and not df.empty:
                    latest_bar = {
                        'timestamp': timestamp.isoformat(),
                        'symbol': secondary_symbol,
                        'timeframe': '1m',
                        'open': float(df['open'].iloc[-1]),
                        'high': float(df['high'].iloc[-1]),
                        'low': float(df['low'].iloc[-1]),
                        'close': float(df['close'].iloc[-1]),
                        'volume': float(df['volume'].iloc[-1])
                    }

                    # Only add if different from last entry or if stream is empty
                    if len(self.data_streams['btc_1m']) == 0 or \
                       self.data_streams['btc_1m'][-1]['close'] != latest_bar['close']:
                        self.data_streams['btc_1m'].append(latest_bar)

                        # If stream was empty, populate with historical data
                        if len(self.data_streams['btc_1m']) == 1:
                            logger.info("Populating btc_1m with historical data...")
                            self._populate_historical_data(df, 'btc_1m', secondary_symbol, '1m')

            # Legacy timeframes for compatibility
            for timeframe in ['5m', '15m']:
                if self.data_provider:
                    df = self.data_provider.get_historical_data(primary_symbol, timeframe, limit=5)
                    if df is not None and not df.empty:
                        latest_bar = {
                            'timestamp': timestamp.isoformat(),
                            'symbol': primary_symbol,
                            'timeframe': timeframe,
                            'open': float(df['open'].iloc[-1]),
                            'high': float(df['high'].iloc[-1]),
                            'low': float(df['low'].iloc[-1]),
                            'close': float(df['close'].iloc[-1]),
                            'volume': float(df['volume'].iloc[-1])
                        }
                        stream_key = f'ohlcv_{timeframe}'
                        if len(self.data_streams[stream_key]) == 0 or \
                           self.data_streams[stream_key][-1]['timestamp'] != latest_bar['timestamp']:
                            self.data_streams[stream_key].append(latest_bar)
        except Exception as e:
            logger.debug(f"Error collecting OHLCV data: {e}")
    def _populate_historical_data(self, df, stream_key, symbol, timeframe):
        """Populate stream with historical data from DataFrame"""
        try:
            # Clear the stream first (it should only have 1 latest entry)
            self.data_streams[stream_key].clear()

            # Add all historical data
            for _, row in df.iterrows():
                bar_data = {
                    'timestamp': row.name.isoformat() if hasattr(row.name, 'isoformat') else str(row.name),
                    'symbol': symbol,
                    'timeframe': timeframe,
                    'open': float(row['open']),
                    'high': float(row['high']),
                    'low': float(row['low']),
                    'close': float(row['close']),
                    'volume': float(row['volume'])
                }
                self.data_streams[stream_key].append(bar_data)

            logger.info(f"✅ Loaded {len(df)} historical candles for {stream_key} ({symbol} {timeframe})")
        except Exception as e:
            logger.error(f"Error populating historical data for {stream_key}: {e}")
    def _collect_tick_data(self, timestamp: datetime):
        """Collect real-time tick data"""
        try:
            if self.data_provider and hasattr(self.data_provider, 'get_recent_ticks'):
                recent_ticks = self.data_provider.get_recent_ticks(limit=10)
                for tick in recent_ticks:
                    tick_data = {
                        'timestamp': timestamp.isoformat(),
                        'symbol': tick.get('symbol', 'ETH/USDT'),
                        'price': float(tick.get('price', 0)),
                        'volume': float(tick.get('volume', 0)),
                        'side': tick.get('side', 'unknown'),
                        'trade_id': tick.get('trade_id', ''),
                        'is_buyer_maker': tick.get('is_buyer_maker', False)
                    }

                    # Only add if different from last tick
                    if len(self.data_streams['ticks']) == 0 or \
                       self.data_streams['ticks'][-1]['trade_id'] != tick_data['trade_id']:
                        self.data_streams['ticks'].append(tick_data)
        except Exception as e:
            logger.debug(f"Error collecting tick data: {e}")
    def _collect_cob_data(self, timestamp: datetime):
        """Collect COB (Consolidated Order Book) data"""
        try:
            # Raw COB snapshots
            if hasattr(self, 'orchestrator') and self.orchestrator and \
               hasattr(self.orchestrator, 'latest_cob_data'):
                for symbol in self.stream_config['filter_symbols']:
                    if symbol in self.orchestrator.latest_cob_data:
                        cob_data = self.orchestrator.latest_cob_data[symbol]
                        raw_cob = {
                            'timestamp': timestamp.isoformat(),
                            'symbol': symbol,
                            'stats': cob_data.get('stats', {}),
                            'bids_count': len(cob_data.get('bids', [])),
                            'asks_count': len(cob_data.get('asks', [])),
                            'imbalance': cob_data.get('stats', {}).get('imbalance', 0),
                            'spread_bps': cob_data.get('stats', {}).get('spread_bps', 0),
                            'mid_price': cob_data.get('stats', {}).get('mid_price', 0)
                        }
                        self.data_streams['cob_raw'].append(raw_cob)

                        # Top 5 bids and asks for aggregation
                        if cob_data.get('bids') and cob_data.get('asks'):
                            aggregated_cob = {
                                'timestamp': timestamp.isoformat(),
                                'symbol': symbol,
                                'bids': cob_data['bids'][:5],  # Top 5 bids
                                'asks': cob_data['asks'][:5],  # Top 5 asks
                                'imbalance': raw_cob['imbalance'],
                                'spread_bps': raw_cob['spread_bps']
                            }
                            self.data_streams['cob_aggregated'].append(aggregated_cob)
        except Exception as e:
            logger.debug(f"Error collecting COB data: {e}")
    def _collect_technical_indicators(self, timestamp: datetime):
        """Collect technical indicators"""
        try:
            if self.data_provider and hasattr(self.data_provider, 'calculate_technical_indicators'):
                for symbol in self.stream_config['filter_symbols']:
                    indicators = self.data_provider.calculate_technical_indicators(symbol)
                    if indicators:
                        indicator_data = {
                            'timestamp': timestamp.isoformat(),
                            'symbol': symbol,
                            'indicators': indicators
                        }
                        self.data_streams['technical_indicators'].append(indicator_data)
        except Exception as e:
            logger.debug(f"Error collecting technical indicators: {e}")
    def _collect_model_states(self, timestamp: datetime):
        """Collect current model states for each model"""
        try:
            if not self.orchestrator:
                return

            model_states = {}

            # DQN State
            if hasattr(self.orchestrator, 'build_comprehensive_rl_state'):
                for symbol in self.stream_config['filter_symbols']:
                    rl_state = self.orchestrator.build_comprehensive_rl_state(symbol)
                    if rl_state:
                        model_states['dqn'] = {
                            'symbol': symbol,
                            'state_vector': rl_state.get('state_vector', []),
                            'features': rl_state.get('features', {}),
                            'metadata': rl_state.get('metadata', {})
                        }

            # CNN State
            if hasattr(self.orchestrator, 'cnn_model') and self.orchestrator.cnn_model:
                for symbol in self.stream_config['filter_symbols']:
                    if hasattr(self.orchestrator.cnn_model, 'get_state_features'):
                        cnn_features = self.orchestrator.cnn_model.get_state_features(symbol)
                        if cnn_features:
                            model_states['cnn'] = {
                                'symbol': symbol,
                                'features': cnn_features
                            }

            # RL Agent State
            if hasattr(self.orchestrator, 'cob_rl_agent') and self.orchestrator.cob_rl_agent:
                rl_state_data = {
                    'epsilon': getattr(self.orchestrator.cob_rl_agent, 'epsilon', 0),
                    'total_steps': getattr(self.orchestrator.cob_rl_agent, 'total_steps', 0),
                    'current_reward': getattr(self.orchestrator.cob_rl_agent, 'current_reward', 0)
                }
                model_states['rl_agent'] = rl_state_data

            if model_states:
                state_sample = {
                    'timestamp': timestamp.isoformat(),
                    'models': model_states
                }
                self.data_streams['model_states'].append(state_sample)
        except Exception as e:
            logger.debug(f"Error collecting model states: {e}")
    def _collect_predictions(self, timestamp: datetime):
        """Collect recent predictions from all models"""
        try:
            if not self.orchestrator:
                return

            predictions = {}

            # Get predictions from orchestrator
            if hasattr(self.orchestrator, 'get_recent_predictions'):
                recent_preds = self.orchestrator.get_recent_predictions(limit=5)
                for pred in recent_preds:
                    model_name = pred.get('model_name', 'unknown')
                    if model_name not in predictions:
                        predictions[model_name] = []
                    predictions[model_name].append({
                        'timestamp': pred.get('timestamp', timestamp.isoformat()),
                        'symbol': pred.get('symbol', 'ETH/USDT'),
                        'prediction': pred.get('prediction'),
                        'confidence': pred.get('confidence', 0),
                        'action': pred.get('action')
                    })

            if predictions:
                prediction_sample = {
                    'timestamp': timestamp.isoformat(),
                    'predictions': predictions
                }
                self.data_streams['predictions'].append(prediction_sample)
        except Exception as e:
            logger.debug(f"Error collecting predictions: {e}")
    def _collect_training_experiences(self, timestamp: datetime):
        """Collect training experiences from the training system"""
        try:
            if self.training_system and hasattr(self.training_system, 'experience_buffer'):
                # Get recent experiences
                recent_experiences = list(self.training_system.experience_buffer)[-10:]  # Last 10
                for exp in recent_experiences:
                    experience_data = {
                        'timestamp': timestamp.isoformat(),
                        'state': exp.get('state', []),
                        'action': exp.get('action'),
                        'reward': exp.get('reward', 0),
                        'next_state': exp.get('next_state', []),
                        'done': exp.get('done', False),
                        'info': exp.get('info', {})
                    }
                    self.data_streams['training_experiences'].append(experience_data)
        except Exception as e:
            logger.debug(f"Error collecting training experiences: {e}")
    def _output_data_sample(self):
        """Output the current data sample to console"""
        if not self.stream_config['console_output']:
            return

        try:
            # Get latest data from each stream
            sample_data = {}
            for stream_name, stream_data in self.data_streams.items():
                if stream_data:
                    sample_data[stream_name] = list(stream_data)[-5:]  # Last 5 entries

            if sample_data:
                if self.stream_config['compact_format']:
                    self._output_compact_format(sample_data)
                else:
                    self._output_detailed_format(sample_data)
        except Exception as e:
            logger.error(f"Error outputting data sample: {e}")
    def _output_compact_format(self, sample_data: Dict):
        """Output data in compact JSON format"""
        try:
            # Create compact summary
            summary = {
                'timestamp': datetime.now().isoformat(),
                'ohlcv_count': len(sample_data.get('ohlcv_1m', [])),
                'ticks_count': len(sample_data.get('ticks', [])),
                'cob_count': len(sample_data.get('cob_raw', [])),
                'predictions_count': len(sample_data.get('predictions', [])),
                'experiences_count': len(sample_data.get('training_experiences', []))
            }

            # Add latest OHLCV if available
            if sample_data.get('ohlcv_1m'):
                latest_ohlcv = sample_data['ohlcv_1m'][-1]
                summary['price'] = latest_ohlcv['close']
                summary['volume'] = latest_ohlcv['volume']

            # Add latest COB if available
            if sample_data.get('cob_raw'):
                latest_cob = sample_data['cob_raw'][-1]
                summary['imbalance'] = latest_cob['imbalance']
                summary['spread_bps'] = latest_cob['spread_bps']

            stream_logger.info(f"DATA_STREAM: {json.dumps(summary, separators=(',', ':'))}")
        except Exception as e:
            logger.error(f"Error in compact output: {e}")
    def _output_detailed_format(self, sample_data: Dict):
        """Output data in detailed human-readable format"""
        try:
            stream_logger.info(f"{'='*80}")
            stream_logger.info(f"DATA STREAM SAMPLE - {datetime.now().strftime('%H:%M:%S')}")
            stream_logger.info(f"{'='*80}")

            # OHLCV Data
            if sample_data.get('ohlcv_1m'):
                latest = sample_data['ohlcv_1m'][-1]
                stream_logger.info(f"OHLCV (1m): {latest['symbol']} | O:{latest['open']:.2f} H:{latest['high']:.2f} L:{latest['low']:.2f} C:{latest['close']:.2f} V:{latest['volume']:.1f}")

            # Tick Data
            if sample_data.get('ticks'):
                latest_tick = sample_data['ticks'][-1]
                stream_logger.info(f"TICK: {latest_tick['symbol']} | Price:{latest_tick['price']:.2f} Vol:{latest_tick['volume']:.4f} Side:{latest_tick['side']}")

            # COB Data
            if sample_data.get('cob_raw'):
                latest_cob = sample_data['cob_raw'][-1]
                stream_logger.info(f"COB: {latest_cob['symbol']} | Imbalance:{latest_cob['imbalance']:.3f} Spread:{latest_cob['spread_bps']:.1f}bps Mid:{latest_cob['mid_price']:.2f}")

            # Model States
            if sample_data.get('model_states'):
                latest_state = sample_data['model_states'][-1]
                models = latest_state.get('models', {})
                if 'dqn' in models:
                    dqn_state = models['dqn']
                    state_vec = dqn_state.get('state_vector', [])
                    if state_vec:
                        stream_logger.info(f"DQN State: {len(state_vec)} features | Price:{state_vec[0]*10000:.2f}")
                    else:
                        stream_logger.info("DQN State: No state")

            # Predictions
            if sample_data.get('predictions'):
                latest_preds = sample_data['predictions'][-1]
                for model_name, preds in latest_preds.get('predictions', {}).items():
                    if preds:
                        latest_pred = preds[-1]
                        action = latest_pred.get('action', 'N/A')
                        conf = latest_pred.get('confidence', 0)
                        stream_logger.info(f"{model_name.upper()} Prediction: {action} (conf:{conf:.2f})")

            # Training Experiences
            if sample_data.get('training_experiences'):
                latest_exp = sample_data['training_experiences'][-1]
                reward = latest_exp.get('reward', 0)
                action = latest_exp.get('action', 'N/A')
                done = latest_exp.get('done', False)
                stream_logger.info(f"Training Exp: Action:{action} Reward:{reward:.4f} Done:{done}")

            stream_logger.info(f"{'='*80}")
        except Exception as e:
            logger.error(f"Error in detailed output: {e}")
    def get_stream_snapshot(self) -> Dict[str, List]:
        """Get a complete snapshot of all data streams"""
        return {stream_name: list(stream_data) for stream_name, stream_data in self.data_streams.items()}
    def save_snapshot(self, filepath: str):
        """Save current data streams to file"""
        try:
            snapshot = self.get_stream_snapshot()
            snapshot['metadata'] = {
                'timestamp': datetime.now().isoformat(),
                'config': self.stream_config
            }

            with open(filepath, 'w') as f:
                json.dump(snapshot, f, indent=2, default=str)

            logger.info(f"Data stream snapshot saved to {filepath}")
        except Exception as e:
            logger.error(f"Error saving snapshot: {e}")
    def load_snapshot(self, filepath: str):
        """Load data streams from file"""
        try:
            with open(filepath, 'r') as f:
                snapshot = json.load(f)

            for stream_name, data in snapshot.items():
                if stream_name in self.data_streams and stream_name != 'metadata':
                    self.data_streams[stream_name].clear()
                    self.data_streams[stream_name].extend(data)

            logger.info(f"Data stream snapshot loaded from {filepath}")
        except Exception as e:
            logger.error(f"Error loading snapshot: {e}")
# Global instance for easy access
_data_stream_monitor = None


def get_data_stream_monitor(orchestrator=None, data_provider=None, training_system=None) -> DataStreamMonitor:
    """Get or create the global data stream monitor instance"""
    global _data_stream_monitor
    if _data_stream_monitor is None:
        _data_stream_monitor = DataStreamMonitor(orchestrator, data_provider, training_system)
    elif orchestrator is not None or data_provider is not None or training_system is not None:
        # Update existing instance with new connections if provided
        if orchestrator is not None:
            _data_stream_monitor.orchestrator = orchestrator
        if data_provider is not None:
            _data_stream_monitor.data_provider = data_provider
        if training_system is not None:
            _data_stream_monitor.training_system = training_system
        logger.info("Updated existing DataStreamMonitor with new connections")
    return _data_stream_monitor
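

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of the capture -> snapshot -> replay cycle this module is
# built for. In the real gogo2 application the orchestrator, data provider,
# and training system are wired in by the dashboard/runtime; they are assumed
# here and left as None, so most collectors simply no-op and only the
# streaming loop and snapshot I/O are exercised. The snapshot path is an
# arbitrary example, not a project convention.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Obtain (or lazily create) the shared monitor instance
    monitor = get_data_stream_monitor(orchestrator=None, data_provider=None, training_system=None)

    # Stream samples for a short period, then persist them for later replay
    monitor.start_streaming()
    try:
        time.sleep(10)  # roughly ten 1-second samples at the default sampling_rate
    finally:
        monitor.stop_streaming()

    snapshot_path = os.path.join('logs', 'data_stream_snapshot.json')  # hypothetical path
    monitor.save_snapshot(snapshot_path)

    # Later (or in another process) the snapshot can be reloaded for replay/training
    monitor.load_snapshot(snapshot_path)
    print({name: len(data) for name, data in monitor.get_stream_snapshot().items()})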