# gogo2/core/unified_data_validator.py

"""
Data Validator for unified storage system.
Validates all incoming data before storage to ensure data integrity.
"""

import logging
import math
from datetime import datetime, timezone
from typing import Dict, List, Tuple, Optional, Any

import pandas as pd

from .unified_data_models import OHLCVCandle, OrderBookDataFrame, TradeEvent

logger = logging.getLogger(__name__)


class DataValidator:
    """
    Validates all incoming data before storage.
    Ensures data integrity and consistency across the system.

    Every method is a static method. The ``validate_*`` methods return a
    ``(is_valid, error_message)`` tuple and catch all exceptions raised while
    inspecting the payload, so malformed input is reported as a failed
    validation instead of propagating. The ``sanitize_*`` methods return a
    corrected shallow copy of their input on a best-effort basis.
    """

    # Validation thresholds
    MAX_PRICE_CHANGE_PERCENT = 50.0  # 50% max open-to-close change per candle
    MIN_PRICE = 0.0001  # Minimum valid price
    MAX_PRICE = 1000000.0  # Maximum valid price
    MIN_VOLUME = 0.0  # Minimum valid volume
    MAX_SPREAD_PERCENT = 10.0  # 10% max spread
    MAX_TIMESTAMP_DRIFT_DAYS = 3650  # Timestamps must be within ~10 years of now

    # Accepted candle timeframes and trade sides (sides are compared lowercased)
    VALID_TIMEFRAMES = ('1s', '1m', '5m', '15m', '30m', '1h', '4h', '1d', '1w')
    VALID_SIDES = ('buy', 'sell', 'bid', 'ask')

    # Characters allowed in a symbol such as BTCUSDT, BTC/USDT or BTC-USDT.
    # Both cases are listed explicitly instead of upper-casing the input,
    # because str.upper() maps some non-ASCII characters onto ASCII letters
    # (for example 'ß' -> 'SS'), which would let them slip through.
    _SYMBOL_CHARS = frozenset(
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        'abcdefghijklmnopqrstuvwxyz'
        '0123456789/-'
    )

    @staticmethod
    def validate_ohlcv(candle: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate OHLCV candle data.

        Checks, in order: presence of all required fields, that every price
        is a real number inside [MIN_PRICE, MAX_PRICE], the OHLC ordering
        (high is the maximum, low is the minimum), that volume is finite and
        not below MIN_VOLUME, that the open-to-close move does not exceed
        MAX_PRICE_CHANGE_PERCENT, and finally timestamp, symbol and timeframe.

        Args:
            candle: Dictionary with OHLCV data

        Returns:
            Tuple of (is_valid, error_message); error_message is None when
            the candle is valid.
        """
        try:
            # Check required fields
            required_fields = ('timestamp', 'symbol', 'timeframe', 'open_price',
                               'high_price', 'low_price', 'close_price', 'volume')

            for field in required_fields:
                if field not in candle:
                    return False, f"Missing required field: {field}"

            # Extract values
            open_price = float(candle['open_price'])
            high_price = float(candle['high_price'])
            low_price = float(candle['low_price'])
            close_price = float(candle['close_price'])
            volume = float(candle['volume'])

            # Validate price ranges
            for price in (open_price, high_price, low_price, close_price):
                # NaN compares False against everything, so it would pass the
                # range checks below unless rejected explicitly.
                if math.isnan(price):
                    return False, f"Price is not a number: {price}"
                if price < DataValidator.MIN_PRICE:
                    return False, f"Price below minimum: {price}"
                if price > DataValidator.MAX_PRICE:
                    return False, f"Price above maximum: {price}"

            # Validate OHLC relationships
            if high_price < low_price:
                return False, f"High ({high_price}) < Low ({low_price})"

            if high_price < open_price:
                return False, f"High ({high_price}) < Open ({open_price})"

            if high_price < close_price:
                return False, f"High ({high_price}) < Close ({close_price})"

            if low_price > open_price:
                return False, f"Low ({low_price}) > Open ({open_price})"

            if low_price > close_price:
                return False, f"Low ({low_price}) > Close ({close_price})"

            # Validate volume (the second check catches NaN and +inf)
            if volume < DataValidator.MIN_VOLUME:
                return False, f"Volume below minimum: {volume}"
            if not math.isfinite(volume):
                return False, f"Non-finite volume: {volume}"

            # Validate price change (prevent extreme outliers)
            if open_price > 0:
                price_change_percent = abs((close_price - open_price) / open_price) * 100
                if price_change_percent > DataValidator.MAX_PRICE_CHANGE_PERCENT:
                    return False, f"Price change too large: {price_change_percent:.2f}%"

            # Validate timestamp
            if not DataValidator._validate_timestamp(candle['timestamp']):
                return False, "Invalid timestamp"

            # Validate symbol
            if not DataValidator._validate_symbol(candle['symbol']):
                return False, f"Invalid symbol: {candle['symbol']}"

            # Validate timeframe
            if not DataValidator._validate_timeframe(candle['timeframe']):
                return False, f"Invalid timeframe: {candle['timeframe']}"

            return True, None

        except (ValueError, TypeError, KeyError, OverflowError) as e:
            # Non-numeric or otherwise unconvertible field values land here.
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating OHLCV: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_ohlcv_candle(candle: "OHLCVCandle") -> Tuple[bool, Optional[str]]:
        """
        Validate OHLCVCandle object.

        The object's attributes are copied into a dictionary and checked by
        validate_ohlcv.

        Args:
            candle: OHLCVCandle object

        Returns:
            Tuple of (is_valid, error_message)
        """
        candle_dict = {
            'timestamp': candle.timestamp,
            'symbol': candle.symbol,
            'timeframe': candle.timeframe,
            'open_price': candle.open_price,
            'high_price': candle.high_price,
            'low_price': candle.low_price,
            'close_price': candle.close_price,
            'volume': candle.volume,
        }
        return DataValidator.validate_ohlcv(candle_dict)

    @staticmethod
    def _validate_levels(levels: List[Any], label: str) -> Tuple[List[float], Optional[str]]:
        """
        Validate one side of an order book.

        Args:
            levels: List of levels, each a list/tuple whose first two items
                are price and size
            label: 'bid' or 'ask'; only used to build the error message

        Returns:
            Tuple of (prices, error_message). error_message is None when all
            levels are valid, in which case prices holds every level's price
            as a float. May raise ValueError/TypeError when a price or size
            cannot be converted to float; the caller handles that.
        """
        prices: List[float] = []
        for level in levels:
            if not isinstance(level, (list, tuple)) or len(level) < 2:
                return prices, f"Invalid {label} format"

            price, size = float(level[0]), float(level[1])

            # A chained comparison is False for NaN, so NaN is rejected too.
            if not DataValidator.MIN_PRICE <= price <= DataValidator.MAX_PRICE:
                return prices, f"Invalid {label} price: {price}"

            if not math.isfinite(size) or size <= 0:
                return prices, f"Invalid {label} size: {size}"

            prices.append(price)

        return prices, None

    @staticmethod
    def validate_orderbook(orderbook: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate order book data.

        Requires non-empty 'bids' and 'asks' lists of (price, size, ...)
        levels with in-range prices and positive finite sizes, a best bid
        strictly below the best ask, and a spread no wider than
        MAX_SPREAD_PERCENT of the mid price. 'timestamp' and 'symbol' are
        validated only when present.

        Args:
            orderbook: Dictionary with order book data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            if 'bids' not in orderbook or 'asks' not in orderbook:
                return False, "Missing bids or asks"

            bids = orderbook['bids']
            asks = orderbook['asks']

            # Check if bids and asks are lists
            if not isinstance(bids, list) or not isinstance(asks, list):
                return False, "Bids and asks must be lists"

            # Check if bids and asks are not empty
            if not bids or not asks:
                return False, "Bids or asks are empty"

            # Validate bid levels, then ask levels
            bid_prices, error = DataValidator._validate_levels(bids, 'bid')
            if error is not None:
                return False, error

            ask_prices, error = DataValidator._validate_levels(asks, 'ask')
            if error is not None:
                return False, error

            # Validate bid/ask relationship (input need not be sorted)
            best_bid = max(bid_prices)
            best_ask = min(ask_prices)

            if best_bid >= best_ask:
                return False, f"Best bid ({best_bid}) >= Best ask ({best_ask})"

            # Validate spread; mid_price is positive because both prices
            # are at least MIN_PRICE.
            spread = best_ask - best_bid
            mid_price = (best_bid + best_ask) / 2
            spread_percent = (spread / mid_price) * 100

            if spread_percent > DataValidator.MAX_SPREAD_PERCENT:
                return False, f"Spread too large: {spread_percent:.2f}%"

            # Validate timestamp if present
            if 'timestamp' in orderbook:
                if not DataValidator._validate_timestamp(orderbook['timestamp']):
                    return False, "Invalid timestamp"

            # Validate symbol if present
            if 'symbol' in orderbook:
                if not DataValidator._validate_symbol(orderbook['symbol']):
                    return False, f"Invalid symbol: {orderbook['symbol']}"

            return True, None

        except (ValueError, TypeError, KeyError, OverflowError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating orderbook: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_orderbook_dataframe(orderbook: "OrderBookDataFrame") -> Tuple[bool, Optional[str]]:
        """
        Validate OrderBookDataFrame object.

        The object's bids, asks, timestamp and symbol attributes are copied
        into a dictionary and checked by validate_orderbook.

        Args:
            orderbook: OrderBookDataFrame object

        Returns:
            Tuple of (is_valid, error_message)
        """
        orderbook_dict = {
            'bids': orderbook.bids,
            'asks': orderbook.asks,
            'timestamp': orderbook.timestamp,
            'symbol': orderbook.symbol,
        }
        return DataValidator.validate_orderbook(orderbook_dict)

    @staticmethod
    def validate_trade(trade: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate trade event data.

        Requires timestamp, symbol, price, size, side and trade_id. Price
        must lie in [MIN_PRICE, MAX_PRICE], size must be positive and finite,
        side must be a string matching VALID_SIDES case-insensitively, and
        trade_id must be truthy.

        Args:
            trade: Dictionary with trade data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            required_fields = ('timestamp', 'symbol', 'price', 'size', 'side', 'trade_id')

            for field in required_fields:
                if field not in trade:
                    return False, f"Missing required field: {field}"

            # Validate price (the chained comparison also rejects NaN)
            price = float(trade['price'])
            if not DataValidator.MIN_PRICE <= price <= DataValidator.MAX_PRICE:
                return False, f"Invalid price: {price}"

            # Validate size
            size = float(trade['size'])
            if not math.isfinite(size) or size <= 0:
                return False, f"Invalid size: {size}"

            # Validate side; a non-string side is an ordinary validation
            # failure, not an unexpected error.
            side = trade['side']
            if not isinstance(side, str) or side.lower() not in DataValidator.VALID_SIDES:
                return False, f"Invalid side: {side}"

            # Validate timestamp
            if not DataValidator._validate_timestamp(trade['timestamp']):
                return False, "Invalid timestamp"

            # Validate symbol
            if not DataValidator._validate_symbol(trade['symbol']):
                return False, f"Invalid symbol: {trade['symbol']}"

            # Validate trade_id
            if not trade['trade_id']:
                return False, "Empty trade_id"

            return True, None

        except (ValueError, TypeError, KeyError, OverflowError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating trade: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_trade_event(trade: "TradeEvent") -> Tuple[bool, Optional[str]]:
        """
        Validate TradeEvent object.

        The object's attributes are copied into a dictionary and checked by
        validate_trade.

        Args:
            trade: TradeEvent object

        Returns:
            Tuple of (is_valid, error_message)
        """
        trade_dict = {
            'timestamp': trade.timestamp,
            'symbol': trade.symbol,
            'price': trade.price,
            'size': trade.size,
            'side': trade.side,
            'trade_id': trade.trade_id,
        }
        return DataValidator.validate_trade(trade_dict)

    @staticmethod
    def validate_imbalances(imbalances: Dict[str, float]) -> Tuple[bool, Optional[str]]:
        """
        Validate order book imbalance metrics.

        At least one of the known imbalance fields must be present and every
        present field must be convertible to float. Values outside [-1, 1]
        (including NaN) only produce a warning; they do not fail validation.

        Args:
            imbalances: Dictionary with imbalance metrics

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Expected imbalance fields
            expected_fields = (
                'imbalance_1s', 'imbalance_5s', 'imbalance_15s', 'imbalance_60s',
                'volume_imbalance_1s', 'volume_imbalance_5s',
                'volume_imbalance_15s', 'volume_imbalance_60s',
            )

            # Check if at least some imbalance fields are present
            present_fields = [f for f in expected_fields if f in imbalances]
            if not present_fields:
                return False, "No imbalance fields present"

            # Validate imbalance values (should be between -1 and 1)
            for field in present_fields:
                value = float(imbalances[field])
                # Written as a negated range test so NaN is flagged as well.
                if not -1.0 <= value <= 1.0:
                    logger.warning("Imbalance %s out of range [-1, 1]: %s", field, value)
                    # Don't fail validation, just warn

            return True, None

        except (ValueError, TypeError, OverflowError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating imbalances: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def _validate_timestamp(timestamp: Any) -> bool:
        """
        Validate timestamp format and value.

        Accepts a datetime, a Unix timestamp in seconds (int or float) or an
        ISO-8601 string (a trailing 'Z' is read as UTC). Timezone-naive
        values are interpreted as UTC. The timestamp must lie within
        MAX_TIMESTAMP_DRIFT_DAYS of the current time.

        Args:
            timestamp: Value to check

        Returns:
            True when the timestamp is parseable and in range, else False.
        """
        try:
            if isinstance(timestamp, datetime):
                moment = timestamp
            elif isinstance(timestamp, (int, float)):
                # Unix timestamp
                moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
            elif isinstance(timestamp, str):
                # ISO format string
                moment = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
            else:
                return False

            # Subtracting an aware datetime from a naive one raises
            # TypeError, so naive values are pinned to UTC first.
            if moment.utcoffset() is None:
                moment = moment.replace(tzinfo=timezone.utc)

            diff_days = abs((moment - datetime.now(timezone.utc)).days)

            # Negated "<=" so that only a real, in-range number passes.
            if not diff_days <= DataValidator.MAX_TIMESTAMP_DRIFT_DAYS:
                logger.warning("Timestamp too far from now: %s", moment)
                return False

            return True

        except Exception as e:
            logger.debug("Timestamp validation error: %s", e)
            return False

    @staticmethod
    def _validate_symbol(symbol: str) -> bool:
        """
        Validate trading symbol format.

        A valid symbol is a string of 3 to 20 characters made only of ASCII
        letters, digits, '/' and '-' (e.g. BTCUSDT, BTC/USDT, BTC-USDT).
        """
        if not symbol or not isinstance(symbol, str):
            return False

        # Symbol should be of reasonable length
        if len(symbol) < 3 or len(symbol) > 20:
            return False

        return all(char in DataValidator._SYMBOL_CHARS for char in symbol)

    @staticmethod
    def _validate_timeframe(timeframe: str) -> bool:
        """Return True when timeframe is one of VALID_TIMEFRAMES."""
        return timeframe in DataValidator.VALID_TIMEFRAMES

    @staticmethod
    def validate_dataframe(df: pd.DataFrame, required_columns: List[str]) -> Tuple[bool, Optional[str]]:
        """
        Validate pandas DataFrame has required columns and valid data.

        The frame must be non-empty, contain every required column, and have
        no null values in those columns.

        Args:
            df: DataFrame to validate
            required_columns: Required column names (any iterable of names)

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            if df is None:
                return False, "DataFrame is None"

            # Check if DataFrame is empty
            if df.empty:
                return False, "DataFrame is empty"

            # df[...] treats a tuple as a single key, so normalise to a list.
            columns = list(required_columns)

            # Check required columns
            missing_columns = [col for col in columns if col not in df.columns]
            if missing_columns:
                return False, f"Missing columns: {missing_columns}"

            # Check for null values in required columns
            null_counts = df[columns].isnull().sum()
            if null_counts.any():
                null_cols = null_counts[null_counts > 0].to_dict()
                return False, f"Null values found: {null_cols}"

            return True, None

        except Exception as e:
            logger.error("DataFrame validation error: %s", e)
            return False, f"Validation error: {str(e)}"

    @staticmethod
    def sanitize_ohlcv(candle: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize OHLCV data by fixing common issues.

        Works on a shallow copy: converts price/volume fields to float,
        widens high/low so they enclose whichever of open/close are present,
        clamps volume to be non-negative and converts trade_count to int.
        If any step raises, the error is logged and the partially sanitized
        copy is returned.

        Args:
            candle: OHLCV candle dictionary

        Returns:
            Sanitized candle dictionary
        """
        sanitized = candle.copy()

        try:
            # Ensure numeric types
            for field in ('open_price', 'high_price', 'low_price', 'close_price', 'volume'):
                if field in sanitized:
                    sanitized[field] = float(sanitized[field])

            # Fix high/low if needed
            if 'high_price' in sanitized and 'low_price' in sanitized:
                # Only prices that are actually present take part; a missing
                # open/close must not be treated as 0 and drag low to 0.
                body_prices = [
                    sanitized[key]
                    for key in ('open_price', 'close_price')
                    if key in sanitized
                ]

                # Ensure high >= all prices
                sanitized['high_price'] = max([sanitized['high_price']] + body_prices)

                # Ensure low <= all prices
                sanitized['low_price'] = min([sanitized['low_price']] + body_prices)

            # Ensure non-negative volume (kept as float)
            if 'volume' in sanitized:
                sanitized['volume'] = max(0.0, sanitized['volume'])

            # Ensure trade_count is integer
            if 'trade_count' in sanitized:
                sanitized['trade_count'] = int(sanitized['trade_count'])

        except Exception as e:
            # Best effort: report and return what has been sanitized so far.
            logger.error("Error sanitizing OHLCV: %s", e)

        return sanitized

    @staticmethod
    def _is_usable_level(level: Any) -> bool:
        """
        Return True when level is a list/tuple whose first two items convert
        to a positive, finite price and size.
        """
        if not isinstance(level, (list, tuple)) or len(level) < 2:
            return False
        try:
            price, size = float(level[0]), float(level[1])
        except (TypeError, ValueError, OverflowError):
            # A non-numeric entry only disqualifies this level.
            return False
        return math.isfinite(price) and math.isfinite(size) and price > 0 and size > 0

    @staticmethod
    def sanitize_orderbook(orderbook: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize order book data by fixing common issues.

        Works on a shallow copy: a 'bids'/'asks' value that is not a list is
        replaced by an empty list, unusable levels are dropped, and the
        remaining bids are sorted by price descending and asks ascending.
        Sides that are absent from the input stay absent.

        Args:
            orderbook: Order book dictionary

        Returns:
            Sanitized order book dictionary
        """
        sanitized = orderbook.copy()

        try:
            # (side key, sort descending?)
            for side, descending in (('bids', True), ('asks', False)):
                if side not in sanitized:
                    continue

                levels = sanitized[side]

                # Ensure the side is a list
                if not isinstance(levels, list):
                    levels = []

                # Remove invalid levels, then sort by price
                usable = [lvl for lvl in levels if DataValidator._is_usable_level(lvl)]
                sanitized[side] = sorted(
                    usable, key=lambda lvl: float(lvl[0]), reverse=descending
                )

        except Exception as e:
            logger.error("Error sanitizing orderbook: %s", e)

        return sanitized