""" Comprehensive metrics collection for all system components. """ import time import psutil import threading from typing import Dict, List, Optional, Any, Callable from collections import defaultdict, deque from datetime import datetime, timezone from dataclasses import dataclass, field from ..utils.logging import get_logger from ..utils.timing import get_current_timestamp logger = get_logger(__name__) @dataclass class MetricPoint: """Individual metric data point""" name: str value: float timestamp: datetime labels: Dict[str, str] = field(default_factory=dict) def to_prometheus_format(self) -> str: """Convert to Prometheus format""" labels_str = "" if self.labels: label_pairs = [f'{k}="{v}"' for k, v in self.labels.items()] labels_str = "{" + ",".join(label_pairs) + "}" return f"{self.name}{labels_str} {self.value} {int(self.timestamp.timestamp() * 1000)}" @dataclass class SystemMetrics: """System-level metrics""" cpu_usage: float memory_usage: float memory_available: float disk_usage: float network_bytes_sent: int network_bytes_recv: int active_connections: int timestamp: datetime class MetricsCollector: """ Collects and manages performance metrics from all system components. Provides Prometheus-compatible metrics and real-time monitoring data. """ def __init__(self, collection_interval: float = 1.0, max_history: int = 10000): """ Initialize metrics collector. Args: collection_interval: How often to collect system metrics (seconds) max_history: Maximum number of metric points to keep in memory """ self.collection_interval = collection_interval self.max_history = max_history # Metric storage self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=max_history)) self.counters: Dict[str, float] = defaultdict(float) self.gauges: Dict[str, float] = defaultdict(float) self.histograms: Dict[str, List[float]] = defaultdict(list) # System metrics self.system_metrics_history: deque = deque(maxlen=max_history) # Collection control self._collecting = False self._collection_thread: Optional[threading.Thread] = None self._lock = threading.RLock() # Callbacks for custom metrics self.metric_callbacks: List[Callable[[], Dict[str, float]]] = [] logger.info(f"Metrics collector initialized with {collection_interval}s interval") def start_collection(self) -> None: """Start automatic metrics collection""" if self._collecting: logger.warning("Metrics collection already running") return self._collecting = True self._collection_thread = threading.Thread( target=self._collection_loop, name="MetricsCollector", daemon=True ) self._collection_thread.start() logger.info("Started metrics collection") def stop_collection(self) -> None: """Stop automatic metrics collection""" if not self._collecting: return self._collecting = False if self._collection_thread: self._collection_thread.join(timeout=5.0) logger.info("Stopped metrics collection") def _collection_loop(self) -> None: """Main collection loop""" while self._collecting: try: # Collect system metrics self._collect_system_metrics() # Collect custom metrics from callbacks self._collect_custom_metrics() time.sleep(self.collection_interval) except Exception as e: logger.error(f"Error in metrics collection loop: {e}") time.sleep(self.collection_interval) def _collect_system_metrics(self) -> None: """Collect system-level metrics""" try: # CPU usage cpu_percent = psutil.cpu_percent(interval=None) # Memory usage memory = psutil.virtual_memory() # Disk usage (root partition) disk = psutil.disk_usage('/') # Network stats network = psutil.net_io_counters() # Active connections connections = len(psutil.net_connections()) # Create system metrics object sys_metrics = SystemMetrics( cpu_usage=cpu_percent, memory_usage=memory.percent, memory_available=memory.available / (1024**3), # GB disk_usage=disk.percent, network_bytes_sent=network.bytes_sent, network_bytes_recv=network.bytes_recv, active_connections=connections, timestamp=get_current_timestamp() ) with self._lock: self.system_metrics_history.append(sys_metrics) # Update gauges self.gauges['system_cpu_usage'] = cpu_percent self.gauges['system_memory_usage'] = memory.percent self.gauges['system_memory_available_gb'] = memory.available / (1024**3) self.gauges['system_disk_usage'] = disk.percent self.gauges['system_active_connections'] = connections # Update counters (cumulative) self.counters['system_network_bytes_sent'] = network.bytes_sent self.counters['system_network_bytes_recv'] = network.bytes_recv except Exception as e: logger.error(f"Error collecting system metrics: {e}") def _collect_custom_metrics(self) -> None: """Collect metrics from registered callbacks""" for callback in self.metric_callbacks: try: custom_metrics = callback() if isinstance(custom_metrics, dict): with self._lock: for name, value in custom_metrics.items(): self.record_gauge(name, value) except Exception as e: logger.error(f"Error collecting custom metrics: {e}") def record_counter(self, name: str, value: float = 1.0, labels: Dict[str, str] = None) -> None: """Record a counter metric (cumulative)""" with self._lock: self.counters[name] += value # Store metric point point = MetricPoint( name=name, value=self.counters[name], timestamp=get_current_timestamp(), labels=labels or {} ) self.metrics[name].append(point) def record_gauge(self, name: str, value: float, labels: Dict[str, str] = None) -> None: """Record a gauge metric (current value)""" with self._lock: self.gauges[name] = value # Store metric point point = MetricPoint( name=name, value=value, timestamp=get_current_timestamp(), labels=labels or {} ) self.metrics[name].append(point) def record_histogram(self, name: str, value: float, labels: Dict[str, str] = None) -> None: """Record a histogram metric (for latency, sizes, etc.)""" with self._lock: self.histograms[name].append(value) # Keep only recent values if len(self.histograms[name]) > 1000: self.histograms[name] = self.histograms[name][-1000:] # Store metric point point = MetricPoint( name=name, value=value, timestamp=get_current_timestamp(), labels=labels or {} ) self.metrics[name].append(point) def increment_counter(self, name: str, labels: Dict[str, str] = None) -> None: """Increment a counter by 1""" self.record_counter(name, 1.0, labels) def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None) -> None: """Set a gauge value""" self.record_gauge(name, value, labels) def observe_histogram(self, name: str, value: float, labels: Dict[str, str] = None) -> None: """Observe a value in a histogram""" self.record_histogram(name, value, labels) def get_current_metrics(self) -> Dict[str, Any]: """Get current metric values""" with self._lock: return { 'counters': dict(self.counters), 'gauges': dict(self.gauges), 'histograms': { name: { 'count': len(values), 'sum': sum(values), 'avg': sum(values) / len(values) if values else 0, 'min': min(values) if values else 0, 'max': max(values) if values else 0, 'p50': self._percentile(values, 50) if values else 0, 'p95': self._percentile(values, 95) if values else 0, 'p99': self._percentile(values, 99) if values else 0 } for name, values in self.histograms.items() }, 'system': self.get_latest_system_metrics() } def get_latest_system_metrics(self) -> Optional[Dict[str, Any]]: """Get the latest system metrics""" with self._lock: if not self.system_metrics_history: return None latest = self.system_metrics_history[-1] return { 'cpu_usage': latest.cpu_usage, 'memory_usage': latest.memory_usage, 'memory_available_gb': latest.memory_available, 'disk_usage': latest.disk_usage, 'network_bytes_sent': latest.network_bytes_sent, 'network_bytes_recv': latest.network_bytes_recv, 'active_connections': latest.active_connections, 'timestamp': latest.timestamp.isoformat() } def get_metric_history(self, name: str, limit: int = 100) -> List[Dict[str, Any]]: """Get historical values for a specific metric""" with self._lock: if name not in self.metrics: return [] points = list(self.metrics[name])[-limit:] return [ { 'value': point.value, 'timestamp': point.timestamp.isoformat(), 'labels': point.labels } for point in points ] def get_prometheus_metrics(self) -> str: """Export metrics in Prometheus format""" lines = [] with self._lock: # Export counters for name, value in self.counters.items(): lines.append(f"# TYPE {name} counter") lines.append(f"{name} {value}") # Export gauges for name, value in self.gauges.items(): lines.append(f"# TYPE {name} gauge") lines.append(f"{name} {value}") # Export histograms for name, values in self.histograms.items(): if values: lines.append(f"# TYPE {name} histogram") lines.append(f"{name}_count {len(values)}") lines.append(f"{name}_sum {sum(values)}") # Add percentiles for percentile in [50, 95, 99]: p_value = self._percentile(values, percentile) lines.append(f"{name}_percentile{{quantile=\"0.{percentile:02d}\"}} {p_value}") return "\n".join(lines) def register_callback(self, callback: Callable[[], Dict[str, float]]) -> None: """Register a callback for custom metrics collection""" self.metric_callbacks.append(callback) logger.info(f"Registered metrics callback: {callback.__name__}") def get_performance_summary(self) -> Dict[str, Any]: """Get a performance summary""" current_metrics = self.get_current_metrics() # Calculate rates and trends summary = { 'timestamp': get_current_timestamp().isoformat(), 'system': current_metrics.get('system', {}), 'counters': current_metrics.get('counters', {}), 'gauges': current_metrics.get('gauges', {}), 'performance_indicators': {} } # Add performance indicators histograms = current_metrics.get('histograms', {}) for name, stats in histograms.items(): if 'latency' in name.lower(): summary['performance_indicators'][f"{name}_avg_ms"] = stats['avg'] summary['performance_indicators'][f"{name}_p95_ms"] = stats['p95'] return summary def _percentile(self, values: List[float], percentile: int) -> float: """Calculate percentile of values""" if not values: return 0.0 sorted_values = sorted(values) index = int((percentile / 100.0) * len(sorted_values)) index = min(index, len(sorted_values) - 1) return sorted_values[index] def reset_metrics(self) -> None: """Reset all metrics (useful for testing)""" with self._lock: self.metrics.clear() self.counters.clear() self.gauges.clear() self.histograms.clear() self.system_metrics_history.clear() logger.info("All metrics reset") def get_stats(self) -> Dict[str, Any]: """Get collector statistics""" with self._lock: return { 'collecting': self._collecting, 'collection_interval': self.collection_interval, 'max_history': self.max_history, 'total_metrics': len(self.metrics), 'total_counters': len(self.counters), 'total_gauges': len(self.gauges), 'total_histograms': len(self.histograms), 'system_metrics_count': len(self.system_metrics_history), 'registered_callbacks': len(self.metric_callbacks) } # Global metrics collector instance metrics_collector = MetricsCollector()