# gogo2/COBY/monitoring/metrics_collector.py

"""
Comprehensive metrics collection for all system components.
"""

import time
import psutil
import threading
from typing import Dict, List, Optional, Any, Callable
from collections import defaultdict, deque
from datetime import datetime, timezone
from dataclasses import dataclass, field

from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp

# Module-level logger, obtained from the project's logging helper and keyed by this module's name
logger = get_logger(__name__)


@dataclass
class MetricPoint:
    """Single timestamped observation of a named metric."""
    name: str  # metric name, emitted verbatim
    value: float  # observed value
    timestamp: datetime  # moment of the observation
    labels: Dict[str, str] = field(default_factory=dict)  # optional label set

    @staticmethod
    def _escape_label_value(value: Any) -> str:
        """Escape a label value for the Prometheus text exposition format.

        Backslash, double quote and line feed must be written as ``\\\\``,
        ``\\"`` and ``\\n`` inside a quoted label value; anything else is
        passed through unchanged.
        """
        text = str(value)
        # Backslashes first, so the escapes added below are not escaped again
        text = text.replace("\\", "\\\\")
        text = text.replace('"', '\\"')
        return text.replace("\n", "\\n")

    def to_prometheus_format(self) -> str:
        """Render the point as one Prometheus exposition line.

        Returns:
            ``name{label="value",...} value timestamp_ms``; the brace section
            is omitted when there are no labels, and the timestamp is the
            point's time in integer milliseconds since the epoch.
        """
        labels_str = ""
        if self.labels:
            label_pairs = [
                f'{key}="{self._escape_label_value(val)}"'
                for key, val in self.labels.items()
            ]
            labels_str = "{" + ",".join(label_pairs) + "}"

        timestamp_ms = int(self.timestamp.timestamp() * 1000)
        return f"{self.name}{labels_str} {self.value} {timestamp_ms}"


@dataclass
class SystemMetrics:
    """
    System-level metrics captured in a single sample.

    The per-field notes describe how MetricsCollector._collect_system_metrics
    populates an instance.
    """
    cpu_usage: float  # value returned by psutil.cpu_percent()
    memory_usage: float  # psutil.virtual_memory().percent
    memory_available: float  # psutil.virtual_memory().available divided by 1024**3
    disk_usage: float  # psutil.disk_usage('/').percent
    network_bytes_sent: int  # psutil.net_io_counters().bytes_sent
    network_bytes_recv: int  # psutil.net_io_counters().bytes_recv
    active_connections: int  # number of entries returned by psutil.net_connections()
    timestamp: datetime  # get_current_timestamp() at the time of the sample


class MetricsCollector:
    """
    Collects and manages performance metrics from all system components.

    Provides Prometheus-compatible metrics and real-time monitoring data.
    """

    def __init__(self, collection_interval: float = 1.0, max_history: int = 10000):
        """
        Create empty metric stores and collection state; no thread is started here.

        Args:
            collection_interval: Seconds the collection loop sleeps between samples
            max_history: Upper bound on the points kept per metric series and on
                the system-metrics history
        """
        def _bounded_series() -> deque:
            # Every new metric name gets its own ring buffer capped at max_history
            return deque(maxlen=max_history)

        self.collection_interval = collection_interval
        self.max_history = max_history

        # Background-collection state
        self._lock = threading.RLock()
        self._collection_thread: Optional[threading.Thread] = None
        self._collecting = False

        # Per-name stores: point history, running totals, last values, raw samples
        self.metrics: Dict[str, deque] = defaultdict(_bounded_series)
        self.counters: Dict[str, float] = defaultdict(float)
        self.gauges: Dict[str, float] = defaultdict(float)
        self.histograms: Dict[str, List[float]] = defaultdict(list)

        # Rolling history of SystemMetrics samples
        self.system_metrics_history: deque = deque(maxlen=max_history)

        # User-supplied producers polled on every collection cycle
        self.metric_callbacks: List[Callable[[], Dict[str, float]]] = []

        logger.info(f"Metrics collector initialized with {collection_interval}s interval")

    def start_collection(self) -> None:
        """Launch the background collection thread unless collection is already on."""
        if not self._collecting:
            self._collecting = True
            worker = threading.Thread(
                name="MetricsCollector",
                target=self._collection_loop,
                daemon=True,
            )
            self._collection_thread = worker
            worker.start()
            logger.info("Started metrics collection")
            return

        # Already flagged as running: leave the existing thread alone
        logger.warning("Metrics collection already running")

    def stop_collection(self) -> None:
        """Switch collection off and wait up to five seconds for the thread to finish."""
        if self._collecting:
            # The loop re-checks this flag at the top of each iteration
            self._collecting = False
            worker = self._collection_thread
            if worker:
                worker.join(timeout=5.0)
            logger.info("Stopped metrics collection")

    def _collection_loop(self) -> None:
        """Sample system and callback metrics repeatedly until collection is switched off."""
        while True:
            if not self._collecting:
                break

            try:
                # System sample first, then the registered callbacks
                self._collect_system_metrics()
                self._collect_custom_metrics()

                time.sleep(self.collection_interval)

            except Exception as exc:
                # Log and keep the cadence so one bad cycle does not end the loop
                logger.error(f"Error in metrics collection loop: {exc}")
                time.sleep(self.collection_interval)

    def _collect_system_metrics(self) -> None:
        """
        Take one system sample via psutil and publish it.

        Reads CPU percent, virtual-memory stats, disk usage of '/', network I/O
        counters and the connection count, appends a SystemMetrics record to
        the history and refreshes the matching gauges and counters under the
        lock.

        psutil.net_connections() can raise psutil.AccessDenied when the process
        lacks the required privileges. That failure is handled on its own so it
        no longer discards the whole sample; the connection count is reported
        as 0 in that case. Any other error is logged and the sample is skipped.
        """
        try:
            cpu_percent = psutil.cpu_percent(interval=None)
            memory = psutil.virtual_memory()

            # Disk usage (root partition)
            disk = psutil.disk_usage('/')

            network = psutil.net_io_counters()

            # Active connections: best effort, must not abort the rest of the sample
            try:
                connections = len(psutil.net_connections())
            except (psutil.AccessDenied, PermissionError) as e:
                logger.debug(f"Connection count unavailable, reporting 0: {e}")
                connections = 0

            # Converted once and shared by the record and the gauge
            memory_available_gb = memory.available / (1024**3)

            sys_metrics = SystemMetrics(
                cpu_usage=cpu_percent,
                memory_usage=memory.percent,
                memory_available=memory_available_gb,
                disk_usage=disk.percent,
                network_bytes_sent=network.bytes_sent,
                network_bytes_recv=network.bytes_recv,
                active_connections=connections,
                timestamp=get_current_timestamp()
            )

            with self._lock:
                self.system_metrics_history.append(sys_metrics)

                # Gauges hold the latest reading
                self.gauges['system_cpu_usage'] = cpu_percent
                self.gauges['system_memory_usage'] = memory.percent
                self.gauges['system_memory_available_gb'] = memory_available_gb
                self.gauges['system_disk_usage'] = disk.percent
                self.gauges['system_active_connections'] = connections

                # The psutil byte totals are stored as-is, not added to the previous value
                self.counters['system_network_bytes_sent'] = network.bytes_sent
                self.counters['system_network_bytes_recv'] = network.bytes_recv

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    def _collect_custom_metrics(self) -> None:
        """Collect metrics from registered callbacks"""
        for callback in self.metric_callbacks:
            try:
                custom_metrics = callback()
                if isinstance(custom_metrics, dict):
                    with self._lock:
                        for name, value in custom_metrics.items():
                            self.record_gauge(name, value)
            except Exception as e:
                logger.error(f"Error collecting custom metrics: {e}")

    def record_counter(self, name: str, value: float = 1.0, labels: Dict[str, str] = None) -> None:
        """Add ``value`` to the running total for ``name`` and store the new total as a point."""
        with self._lock:
            running_total = self.counters[name] + value
            self.counters[name] = running_total

            # The history holds the cumulative total, not the increment
            self.metrics[name].append(
                MetricPoint(
                    name=name,
                    value=running_total,
                    timestamp=get_current_timestamp(),
                    labels=labels or {},
                )
            )

    def record_gauge(self, name: str, value: float, labels: Dict[str, str] = None) -> None:
        """Overwrite the current value of gauge ``name`` and append it to the history."""
        with self._lock:
            self.gauges[name] = value

            sample = MetricPoint(
                name=name,
                value=value,
                timestamp=get_current_timestamp(),
                labels=labels or {},
            )
            series = self.metrics[name]
            series.append(sample)

    def record_histogram(self, name: str, value: float, labels: Dict[str, str] = None) -> None:
        """Add one observation to histogram ``name`` (latencies, sizes, etc.)."""
        with self._lock:
            samples = self.histograms[name]
            samples.append(value)

            # Only the most recent 1000 observations are retained
            if len(samples) > 1000:
                self.histograms[name] = samples[-1000:]

            self.metrics[name].append(
                MetricPoint(
                    name=name,
                    value=value,
                    timestamp=get_current_timestamp(),
                    labels=labels or {},
                )
            )

    def increment_counter(self, name: str, labels: Dict[str, str] = None) -> None:
        """Shorthand for adding exactly 1.0 to counter ``name``."""
        self.record_counter(name, value=1.0, labels=labels)

    def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None) -> None:
        """Alias for record_gauge: store ``value`` as the current reading of ``name``."""
        self.record_gauge(name, value, labels=labels)

    def observe_histogram(self, name: str, value: float, labels: Dict[str, str] = None) -> None:
        """Alias for record_histogram: add ``value`` as one observation of ``name``."""
        self.record_histogram(name, value, labels=labels)

    def get_current_metrics(self) -> Dict[str, Any]:
        """Get current metric values"""
        with self._lock:
            return {
                'counters': dict(self.counters),
                'gauges': dict(self.gauges),
                'histograms': {
                    name: {
                        'count': len(values),
                        'sum': sum(values),
                        'avg': sum(values) / len(values) if values else 0,
                        'min': min(values) if values else 0,
                        'max': max(values) if values else 0,
                        'p50': self._percentile(values, 50) if values else 0,
                        'p95': self._percentile(values, 95) if values else 0,
                        'p99': self._percentile(values, 99) if values else 0
                    }
                    for name, values in self.histograms.items()
                },
                'system': self.get_latest_system_metrics()
            }

    def get_latest_system_metrics(self) -> Optional[Dict[str, Any]]:
        """Get the latest system metrics"""
        with self._lock:
            if not self.system_metrics_history:
                return None

            latest = self.system_metrics_history[-1]
            return {
                'cpu_usage': latest.cpu_usage,
                'memory_usage': latest.memory_usage,
                'memory_available_gb': latest.memory_available,
                'disk_usage': latest.disk_usage,
                'network_bytes_sent': latest.network_bytes_sent,
                'network_bytes_recv': latest.network_bytes_recv,
                'active_connections': latest.active_connections,
                'timestamp': latest.timestamp.isoformat()
            }

    def get_metric_history(self, name: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Get historical values for a specific metric"""
        with self._lock:
            if name not in self.metrics:
                return []

            points = list(self.metrics[name])[-limit:]
            return [
                {
                    'value': point.value,
                    'timestamp': point.timestamp.isoformat(),
                    'labels': point.labels
                }
                for point in points
            ]

    def get_prometheus_metrics(self) -> str:
        """Export metrics in Prometheus format"""
        lines = []

        with self._lock:
            # Export counters
            for name, value in self.counters.items():
                lines.append(f"# TYPE {name} counter")
                lines.append(f"{name} {value}")

            # Export gauges
            for name, value in self.gauges.items():
                lines.append(f"# TYPE {name} gauge")
                lines.append(f"{name} {value}")

            # Export histograms
            for name, values in self.histograms.items():
                if values:
                    lines.append(f"# TYPE {name} histogram")
                    lines.append(f"{name}_count {len(values)}")
                    lines.append(f"{name}_sum {sum(values)}")

                    # Add percentiles
                    for percentile in [50, 95, 99]:
                        p_value = self._percentile(values, percentile)
                        lines.append(f"{name}_percentile{{quantile=\"0.{percentile:02d}\"}} {p_value}")

        return "\n".join(lines)

    def register_callback(self, callback: Callable[[], Dict[str, float]]) -> None:
        """
        Register a callback polled on every collection cycle.

        Args:
            callback: Zero-argument callable returning a dict of metric name
                to value; each entry is recorded as a gauge
        """
        self.metric_callbacks.append(callback)

        # functools.partial objects and callable instances have no __name__;
        # fall back to repr() so registration cannot fail on the log line.
        callback_name = getattr(callback, '__name__', None) or repr(callback)
        logger.info(f"Registered metrics callback: {callback_name}")

    def get_performance_summary(self) -> Dict[str, Any]:
        """
        Build a condensed report from the current metric snapshot.

        Returns:
            Dict with a 'timestamp', the 'system', 'counters' and 'gauges'
            sections of the snapshot, and 'performance_indicators' holding the
            average and p95 of every histogram whose name contains 'latency'
            (case-insensitive).
        """
        snapshot = self.get_current_metrics()
        generated_at = get_current_timestamp().isoformat()

        indicators = {}
        for hist_name, stats in snapshot.get('histograms', {}).items():
            if 'latency' not in hist_name.lower():
                continue
            indicators[f"{hist_name}_avg_ms"] = stats['avg']
            indicators[f"{hist_name}_p95_ms"] = stats['p95']

        return {
            'timestamp': generated_at,
            'system': snapshot.get('system', {}),
            'counters': snapshot.get('counters', {}),
            'gauges': snapshot.get('gauges', {}),
            'performance_indicators': indicators,
        }

    def _percentile(self, values: List[float], percentile: int) -> float:
        """
        Pick the value at the given percentile rank from ``values``.

        The values are sorted and indexed at int(percentile / 100 * count),
        capped at the last element. Returns 0.0 for an empty input.
        """
        if not values:
            return 0.0

        ordered = sorted(values)
        last = len(ordered) - 1
        rank = int((percentile / 100.0) * len(ordered))
        # Cap the rank so percentile 100 maps onto the largest value
        return ordered[rank if rank < last else last]

    def reset_metrics(self) -> None:
        """Empty every metric store and the system history (useful for testing)."""
        with self._lock:
            stores = (
                self.metrics,
                self.counters,
                self.gauges,
                self.histograms,
                self.system_metrics_history,
            )
            # The containers are emptied in place, not replaced
            for store in stores:
                store.clear()

        logger.info("All metrics reset")

    def get_stats(self) -> Dict[str, Any]:
        """Get collector statistics"""
        with self._lock:
            return {
                'collecting': self._collecting,
                'collection_interval': self.collection_interval,
                'max_history': self.max_history,
                'total_metrics': len(self.metrics),
                'total_counters': len(self.counters),
                'total_gauges': len(self.gauges),
                'total_histograms': len(self.histograms),
                'system_metrics_count': len(self.system_metrics_history),
                'registered_callbacks': len(self.metric_callbacks)
            }


# Global metrics collector instance, created at import time with the default
# interval and history size; nothing is collected until start_collection() is called
metrics_collector = MetricsCollector()