"""
Performance monitoring dashboard and real-time performance tracking.
"""
import time
import threading
from typing import Dict, List, Optional, Any, Callable
from collections import defaultdict, deque
from datetime import datetime, timedelta
from dataclasses import dataclass
from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp
from .metrics_collector import MetricsCollector
logger = get_logger(__name__)
@dataclass
class PerformanceAlert:
"""Performance alert definition"""
name: str
metric_name: str
threshold: float
comparison: str # 'gt', 'lt', 'eq'
duration: int # seconds
message: str
severity: str = 'warning' # 'info', 'warning', 'critical'
triggered_at: Optional[datetime] = None
resolved_at: Optional[datetime] = None
def is_triggered(self, value: float) -> bool:
"""Check if alert should be triggered"""
if self.comparison == 'gt':
return value > self.threshold
elif self.comparison == 'lt':
return value < self.threshold
elif self.comparison == 'eq':
return abs(value - self.threshold) < 0.001
return False
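# A minimal usage sketch for PerformanceAlert (the metric name and values are
# illustrative, not taken from a live collector):
#
#   cpu_alert = PerformanceAlert(
#       name="cpu_spike",
#       metric_name="system_cpu_usage",
#       threshold=90.0,
#       comparison="gt",
#       duration=60,  # breach must persist this many seconds
#       message="CPU above {threshold}%",
#       severity="critical",
#   )
#   cpu_alert.is_triggered(95.0)  # True: 95.0 > 90.0
#   cpu_alert.is_triggered(42.0)  # False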
@dataclass
class PerformanceThresholds:
"""Performance threshold configuration"""
max_cpu_usage: float = 80.0 # %
max_memory_usage: float = 85.0 # %
min_memory_available: float = 1.0 # GB
max_latency_ms: float = 100.0 # milliseconds
max_error_rate: float = 5.0 # %
min_throughput: float = 100.0 # operations/second
class PerformanceMonitor:
"""
Real-time performance monitoring with alerting and dashboard data.
Monitors system performance, tracks KPIs, and provides alerts.
"""
    def __init__(self, metrics_collector: Optional[MetricsCollector] = None):
"""
Initialize performance monitor.
Args:
metrics_collector: Metrics collector instance
"""
if metrics_collector is None:
from .metrics_collector import metrics_collector as default_collector
self.metrics_collector = default_collector
else:
self.metrics_collector = metrics_collector
self.thresholds = PerformanceThresholds()
        # Alert management
        self.alerts: Dict[str, PerformanceAlert] = {}
        self.active_alerts: Dict[str, PerformanceAlert] = {}
        self.alert_history: deque = deque(maxlen=1000)
        # When each alert's condition first became true (for duration enforcement)
        self._breach_started: Dict[str, datetime] = {}
# Performance tracking
self.performance_history: deque = deque(maxlen=10000)
self.kpi_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))
# Monitoring control
self._monitoring = False
self._monitor_thread: Optional[threading.Thread] = None
self._monitor_interval = 5.0 # seconds
# Alert callbacks
self.alert_callbacks: List[Callable[[PerformanceAlert], None]] = []
# Initialize default alerts
self._setup_default_alerts()
logger.info("Performance monitor initialized")
def _setup_default_alerts(self) -> None:
"""Setup default performance alerts"""
default_alerts = [
PerformanceAlert(
name="high_cpu_usage",
metric_name="system_cpu_usage",
threshold=self.thresholds.max_cpu_usage,
comparison="gt",
duration=30,
message="CPU usage is above {threshold}%",
severity="warning"
),
PerformanceAlert(
name="high_memory_usage",
metric_name="system_memory_usage",
threshold=self.thresholds.max_memory_usage,
comparison="gt",
duration=30,
message="Memory usage is above {threshold}%",
severity="warning"
),
PerformanceAlert(
name="low_memory_available",
metric_name="system_memory_available_gb",
threshold=self.thresholds.min_memory_available,
comparison="lt",
duration=60,
message="Available memory is below {threshold}GB",
severity="critical"
),
PerformanceAlert(
name="high_latency",
metric_name="processing_latency_ms",
threshold=self.thresholds.max_latency_ms,
comparison="gt",
duration=60,
message="Processing latency is above {threshold}ms",
severity="warning"
)
]
for alert in default_alerts:
self.add_alert(alert)
def start_monitoring(self) -> None:
"""Start performance monitoring"""
if self._monitoring:
logger.warning("Performance monitoring already running")
return
self._monitoring = True
self._monitor_thread = threading.Thread(
target=self._monitoring_loop,
name="PerformanceMonitor",
daemon=True
)
self._monitor_thread.start()
logger.info("Started performance monitoring")
def stop_monitoring(self) -> None:
"""Stop performance monitoring"""
if not self._monitoring:
return
self._monitoring = False
if self._monitor_thread:
self._monitor_thread.join(timeout=5.0)
logger.info("Stopped performance monitoring")
def _monitoring_loop(self) -> None:
"""Main monitoring loop"""
while self._monitoring:
try:
# Collect current performance data
self._collect_performance_data()
# Check alerts
self._check_alerts()
# Update KPIs
self._update_kpis()
time.sleep(self._monitor_interval)
except Exception as e:
logger.error(f"Error in performance monitoring loop: {e}")
time.sleep(self._monitor_interval)
def _collect_performance_data(self) -> None:
"""Collect current performance data"""
try:
if self.metrics_collector is None:
return
current_metrics = self.metrics_collector.get_current_metrics()
if current_metrics is None:
return
# Create performance snapshot
performance_data = {
'timestamp': get_current_timestamp(),
'system': current_metrics.get('system', {}),
'counters': current_metrics.get('counters', {}),
'gauges': current_metrics.get('gauges', {}),
'histograms': current_metrics.get('histograms', {})
}
self.performance_history.append(performance_data)
except Exception as e:
logger.error(f"Error collecting performance data: {e}")
def _check_alerts(self) -> None:
"""Check all alerts against current metrics"""
if self.metrics_collector is None:
return
current_metrics = self.metrics_collector.get_current_metrics()
if current_metrics is None:
return
current_time = get_current_timestamp()
for alert_name, alert in self.alerts.items():
try:
# Get metric value
metric_value = self._get_metric_value(alert.metric_name, current_metrics)
if metric_value is None:
continue
                # Check whether the threshold is currently breached
                breached = alert.is_triggered(metric_value)
                if breached:
                    # Record when the breach began; only trigger once it has
                    # persisted for the alert's configured duration
                    breach_start = self._breach_started.setdefault(alert_name, current_time)
                    sustained = (current_time - breach_start).total_seconds() >= alert.duration
                    if sustained and alert_name not in self.active_alerts:
                        # Trigger alert
                        alert.triggered_at = current_time
                        alert.resolved_at = None
                        self.active_alerts[alert_name] = alert
                        self.alert_history.append(alert)
                        # Format message
                        message = alert.message.format(
                            threshold=alert.threshold,
                            value=metric_value
                        )
                        logger.warning(f"Performance alert triggered: {alert.name} - {message}")
                        # Notify callbacks
                        for callback in self.alert_callbacks:
                            try:
                                callback(alert)
                            except Exception as e:
                                logger.error(f"Error in alert callback: {e}")
                else:
                    # Condition cleared: reset breach tracking and resolve if active
                    self._breach_started.pop(alert_name, None)
                    if alert_name in self.active_alerts:
                        resolved_alert = self.active_alerts.pop(alert_name)
                        resolved_alert.resolved_at = current_time
                        logger.info(f"Performance alert resolved: {alert.name}")
except Exception as e:
logger.error(f"Error checking alert {alert_name}: {e}")
def _get_metric_value(self, metric_name: str, metrics: Dict[str, Any]) -> Optional[float]:
"""Get metric value from metrics data"""
if not metrics:
return None
# Check gauges first
gauges = metrics.get('gauges', {})
if gauges and metric_name in gauges:
return gauges[metric_name]
# Check counters
counters = metrics.get('counters', {})
if counters and metric_name in counters:
return counters[metric_name]
# Check histograms (use average)
histograms = metrics.get('histograms', {})
if histograms and metric_name in histograms:
hist_data = histograms[metric_name]
if hist_data and isinstance(hist_data, dict):
return hist_data.get('avg', 0)
# Check system metrics
system_metrics = metrics.get('system', {})
if system_metrics and metric_name in system_metrics:
return system_metrics[metric_name]
return None
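    # For reference, _get_metric_value expects the collector snapshot to look
    # roughly like this (a sketch; the exact keys come from MetricsCollector):
    #
    #   {
    #       'system': {'cpu_usage': 42.0, 'memory_usage': 61.5},
    #       'counters': {'events_processed': 1200},
    #       'gauges': {'queue_depth': 3},
    #       'histograms': {'processing_latency_ms': {'avg': 12.4, 'count': 87}}
    #   }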
def _update_kpis(self) -> None:
"""Update key performance indicators"""
try:
if self.metrics_collector is None:
return
current_metrics = self.metrics_collector.get_current_metrics()
if current_metrics is None:
return
timestamp = get_current_timestamp()
# Calculate throughput (operations per second)
throughput = self._calculate_throughput()
self.kpi_history['throughput_ops_per_sec'].append({
'value': throughput,
'timestamp': timestamp
})
# Calculate error rate
error_rate = self._calculate_error_rate()
self.kpi_history['error_rate_percent'].append({
'value': error_rate,
'timestamp': timestamp
})
# Calculate average latency
avg_latency = self._calculate_average_latency()
self.kpi_history['avg_latency_ms'].append({
'value': avg_latency,
'timestamp': timestamp
})
# Update metrics collector with KPIs
self.metrics_collector.set_gauge('kpi_throughput_ops_per_sec', throughput)
self.metrics_collector.set_gauge('kpi_error_rate_percent', error_rate)
self.metrics_collector.set_gauge('kpi_avg_latency_ms', avg_latency)
except Exception as e:
logger.error(f"Error updating KPIs: {e}")
    def _calculate_throughput(self) -> float:
        """Calculate operations per second throughput"""
        try:
            # Use the two most recent snapshots so both samples come from the
            # same source; counter names matching these keywords are treated
            # as completed-operation counts (a heuristic)
            if len(self.performance_history) < 2:
                return 0.0
            prev_data = self.performance_history[-2]
            current_data = self.performance_history[-1]
            time_diff = (current_data['timestamp'] - prev_data['timestamp']).total_seconds()
            if time_diff <= 0:
                return 0.0
            keywords = ('processed', 'handled', 'completed')
            def _count_ops(counters: Dict[str, Any]) -> float:
                return sum(
                    value for name, value in counters.items()
                    if any(keyword in name.lower() for keyword in keywords)
                )
            current_ops = _count_ops(current_data.get('counters', {}))
            prev_ops = _count_ops(prev_data.get('counters', {}))
            return (current_ops - prev_ops) / time_diff
        except Exception as e:
            logger.error(f"Error calculating throughput: {e}")
            return 0.0
def _calculate_error_rate(self) -> float:
"""Calculate error rate percentage"""
try:
current_metrics = self.metrics_collector.get_current_metrics()
counters = current_metrics.get('counters', {})
# Count errors and total operations
total_errors = sum(
value for name, value in counters.items()
if 'error' in name.lower() or 'failed' in name.lower()
)
total_operations = sum(
value for name, value in counters.items()
if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed', 'total'])
)
if total_operations > 0:
return (total_errors / total_operations) * 100
return 0.0
except Exception as e:
logger.error(f"Error calculating error rate: {e}")
return 0.0
def _calculate_average_latency(self) -> float:
"""Calculate average latency across all operations"""
try:
current_metrics = self.metrics_collector.get_current_metrics()
histograms = current_metrics.get('histograms', {})
# Find latency histograms
latency_values = []
for name, stats in histograms.items():
if 'latency' in name.lower():
latency_values.append(stats.get('avg', 0))
if latency_values:
return sum(latency_values) / len(latency_values)
return 0.0
except Exception as e:
logger.error(f"Error calculating average latency: {e}")
return 0.0
def add_alert(self, alert: PerformanceAlert) -> None:
"""Add a performance alert"""
self.alerts[alert.name] = alert
logger.info(f"Added performance alert: {alert.name}")
def remove_alert(self, alert_name: str) -> None:
"""Remove a performance alert"""
if alert_name in self.alerts:
del self.alerts[alert_name]
# Also remove from active alerts if present
self.active_alerts.pop(alert_name, None)
logger.info(f"Removed performance alert: {alert_name}")
def get_active_alerts(self) -> List[PerformanceAlert]:
"""Get currently active alerts"""
return list(self.active_alerts.values())
def get_alert_history(self, limit: int = 100) -> List[PerformanceAlert]:
"""Get alert history"""
return list(self.alert_history)[-limit:]
def get_performance_dashboard_data(self) -> Dict[str, Any]:
"""Get data for performance dashboard"""
current_metrics = {}
if self.metrics_collector:
current_metrics = self.metrics_collector.get_current_metrics() or {}
system_metrics = current_metrics.get('system', {}) or {}
return {
'timestamp': get_current_timestamp().isoformat(),
'system_metrics': system_metrics,
'kpis': {
name: list(history)[-10:] if history else [] # Last 10 points
for name, history in self.kpi_history.items()
},
'active_alerts': [
{
'name': alert.name,
'message': alert.message,
                    'severity': alert.severity,
'triggered_at': alert.triggered_at.isoformat() if alert.triggered_at else None
}
for alert in self.active_alerts.values()
],
            'performance_summary': {
                'cpu_usage': system_metrics.get('cpu_usage', 0),
                'memory_usage': system_metrics.get('memory_usage', 0),
                'active_connections': system_metrics.get('active_connections', 0),
'throughput': self.kpi_history['throughput_ops_per_sec'][-1]['value'] if self.kpi_history['throughput_ops_per_sec'] else 0,
'error_rate': self.kpi_history['error_rate_percent'][-1]['value'] if self.kpi_history['error_rate_percent'] else 0,
'avg_latency': self.kpi_history['avg_latency_ms'][-1]['value'] if self.kpi_history['avg_latency_ms'] else 0
}
}
def register_alert_callback(self, callback: Callable[[PerformanceAlert], None]) -> None:
"""Register callback for alert notifications"""
self.alert_callbacks.append(callback)
logger.info(f"Registered alert callback: {callback.__name__}")
    def update_thresholds(self, **kwargs) -> None:
        """Update performance thresholds"""
        for key, value in kwargs.items():
            if hasattr(self.thresholds, key):
                setattr(self.thresholds, key, value)
                logger.info(f"Updated threshold {key} to {value}")
            else:
                logger.warning(f"Ignoring unknown threshold: {key}")
def get_performance_trends(self, hours: int = 24) -> Dict[str, Any]:
"""Get performance trends over specified time period"""
cutoff_time = get_current_timestamp() - timedelta(hours=hours)
# Filter performance history
recent_data = [
data for data in self.performance_history
if data and data.get('timestamp') and data['timestamp'] >= cutoff_time
]
if not recent_data:
return {}
        # Calculate trends for key system metrics
        trends = {}
        def _summarize(metric_key: str) -> Optional[Dict[str, Any]]:
            # Collect one system metric across snapshots and summarize it
            values = [
                data['system'].get(metric_key, 0)
                for data in recent_data
                if data.get('system')
            ]
            if not values:
                return None
            if len(values) > 1 and values[-1] > values[0]:
                direction = 'increasing'
            elif len(values) > 1 and values[-1] < values[0]:
                direction = 'decreasing'
            else:
                direction = 'stable'
            return {
                'current': values[-1],
                'average': sum(values) / len(values),
                'max': max(values),
                'trend': direction
            }
        for metric_key in ('cpu_usage', 'memory_usage'):
            summary = _summarize(metric_key)
            if summary:
                trends[metric_key] = summary
return trends
def get_stats(self) -> Dict[str, Any]:
"""Get performance monitor statistics"""
return {
'monitoring': self._monitoring,
'monitor_interval': self._monitor_interval,
'total_alerts': len(self.alerts),
'active_alerts': len(self.active_alerts),
'alert_history_count': len(self.alert_history),
'performance_history_count': len(self.performance_history),
'kpi_metrics': list(self.kpi_history.keys()),
'registered_callbacks': len(self.alert_callbacks),
'thresholds': {
'max_cpu_usage': self.thresholds.max_cpu_usage,
'max_memory_usage': self.thresholds.max_memory_usage,
'min_memory_available': self.thresholds.min_memory_available,
'max_latency_ms': self.thresholds.max_latency_ms,
'max_error_rate': self.thresholds.max_error_rate,
'min_throughput': self.thresholds.min_throughput
}
}
# Global performance monitor instance (initialized lazily)
performance_monitor: Optional[PerformanceMonitor] = None
def get_performance_monitor() -> PerformanceMonitor:
    """Get or create global performance monitor instance"""
    global performance_monitor
    if performance_monitor is None:
        performance_monitor = PerformanceMonitor()
    return performance_monitor
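# Minimal usage sketch. This assumes MetricsCollector feeds the metric names
# referenced by the default alerts; the 15-second sleep is only there to let
# a few monitoring cycles run.
if __name__ == "__main__":
    monitor = get_performance_monitor()
    def on_alert(alert: PerformanceAlert) -> None:
        # Simple callback: print alerts as they fire
        print(f"[{alert.severity}] {alert.name}: {alert.message}")
    monitor.register_alert_callback(on_alert)
    monitor.update_thresholds(max_cpu_usage=90.0)  # example threshold override
    monitor.start_monitoring()
    try:
        time.sleep(15)
        print(monitor.get_performance_dashboard_data()['performance_summary'])
    finally:
        monitor.stop_monitoring()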