395 lines
15 KiB
Python
395 lines
15 KiB
Python
"""
|
|
Comprehensive metrics collection for all system components.
|
|
"""
|
|
|
|
import time
|
|
import psutil
|
|
import threading
|
|
from typing import Dict, List, Optional, Any, Callable
|
|
from collections import defaultdict, deque
|
|
from datetime import datetime, timezone
|
|
from dataclasses import dataclass, field
|
|
|
|
from ..utils.logging import get_logger
|
|
from ..utils.timing import get_current_timestamp
|
|
|
|
# Module-level logger, obtained from the project's logging helper and keyed by this module's name.
logger = get_logger(__name__)
|
|
|
|
|
|
@dataclass
class MetricPoint:
    """Individual metric data point.

    Attributes are documented inline below. ``labels`` defaults to a fresh
    empty dict per instance.
    """

    # Metric name, emitted verbatim at the start of the exposition line.
    name: str
    # Sample value, formatted with ``str()`` semantics when exported.
    value: float
    # Moment the sample was taken; exported as integer milliseconds since epoch.
    timestamp: datetime
    # Optional label name -> label value mapping.
    labels: Dict[str, str] = field(default_factory=dict)

    @staticmethod
    def _escape_label_value(value: Any) -> str:
        """Escape a label value for the Prometheus text exposition format.

        Backslash, double quote and line feed must be written as ``\\\\``,
        ``\\"`` and ``\\n``. The backslash is handled first so the escapes
        added for the other two characters are not escaped again.
        """
        return (
            str(value)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
        )

    def to_prometheus_format(self) -> str:
        """Convert to Prometheus format.

        Returns:
            A single line ``name{k="v",...} value timestamp_ms``. The label
            section is omitted entirely when there are no labels. Label
            values are escaped; label names are emitted as given.
        """
        labels_str = ""
        if self.labels:
            label_pairs = [
                f'{key}="{self._escape_label_value(val)}"'
                for key, val in self.labels.items()
            ]
            labels_str = "{" + ",".join(label_pairs) + "}"

        # datetime.timestamp() yields seconds; the exported value is milliseconds.
        timestamp_ms = int(self.timestamp.timestamp() * 1000)
        return f"{self.name}{labels_str} {self.value} {timestamp_ms}"
|
|
|
|
|
|
@dataclass
class SystemMetrics:
    """System-level metrics.

    One snapshot of host state. Field order is part of the constructor
    signature and must not change. The unit notes below describe how
    ``MetricsCollector._collect_system_metrics`` in this module fills
    each field.
    """

    # CPU utilisation as reported by psutil.cpu_percent().
    cpu_usage: float
    # Memory utilisation, taken from psutil.virtual_memory().percent.
    memory_usage: float
    # Available memory, bytes divided by 1024**3.
    memory_available: float
    # Utilisation of the '/' partition, from psutil.disk_usage('/').percent.
    disk_usage: float
    # bytes_sent from psutil.net_io_counters().
    network_bytes_sent: int
    # bytes_recv from psutil.net_io_counters().
    network_bytes_recv: int
    # Number of entries returned by psutil.net_connections().
    active_connections: int
    # When the snapshot was taken.
    timestamp: datetime
|
|
|
|
|
|
class MetricsCollector:
    """
    Collects and manages performance metrics from all system components.

    Provides Prometheus-compatible metrics and real-time monitoring data.

    Three kinds of metrics are tracked, each keyed by name:

    * counters   -- running totals, increased via ``record_counter``
    * gauges     -- last value written via ``record_gauge``
    * histograms -- the most recent observations (capped per name)

    Every ``record_*`` call also appends a ``MetricPoint`` to a bounded
    per-name history. A daemon thread started by ``start_collection``
    periodically samples system metrics through ``psutil`` and polls the
    registered callbacks. Shared state is read and written while holding a
    re-entrant lock.
    """

    # Number of most recent observations kept per histogram name.
    _HISTOGRAM_WINDOW = 1000

    # Percentiles reported for every histogram.
    _PERCENTILES = (50, 95, 99)

    def __init__(self, collection_interval: float = 1.0, max_history: int = 10000):
        """
        Initialize metrics collector.

        Args:
            collection_interval: How often to collect system metrics (seconds)
            max_history: Maximum number of metric points to keep in memory
                (applied per metric name and to the system metrics history)
        """
        self.collection_interval = collection_interval
        self.max_history = max_history

        # Metric storage
        self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=max_history))
        self.counters: Dict[str, float] = defaultdict(float)
        self.gauges: Dict[str, float] = defaultdict(float)
        self.histograms: Dict[str, List[float]] = defaultdict(list)

        # System metrics
        self.system_metrics_history: deque = deque(maxlen=max_history)

        # Collection control
        self._collecting = False
        self._collection_thread: Optional[threading.Thread] = None
        # Set by stop_collection() so the worker wakes up immediately instead
        # of finishing a full collection_interval sleep.
        self._stop_event = threading.Event()
        self._lock = threading.RLock()

        # Callbacks for custom metrics
        self.metric_callbacks: List[Callable[[], Dict[str, float]]] = []

        logger.info(f"Metrics collector initialized with {collection_interval}s interval")

    def start_collection(self) -> None:
        """Start automatic metrics collection in a daemon thread.

        Logs a warning and does nothing if collection is already running.
        """
        if self._collecting:
            logger.warning("Metrics collection already running")
            return

        self._collecting = True
        self._stop_event.clear()
        self._collection_thread = threading.Thread(
            target=self._collection_loop,
            name="MetricsCollector",
            daemon=True,
        )
        self._collection_thread.start()
        logger.info("Started metrics collection")

    def stop_collection(self) -> None:
        """Stop automatic metrics collection.

        Signals the worker thread and waits up to five seconds for it to
        exit. Does nothing if collection is not running.
        """
        if not self._collecting:
            return

        self._collecting = False
        # Wake the worker if it is waiting between collection rounds.
        self._stop_event.set()
        if self._collection_thread:
            self._collection_thread.join(timeout=5.0)
        logger.info("Stopped metrics collection")

    def _collection_loop(self) -> None:
        """Main collection loop; runs until ``_collecting`` becomes false."""
        while self._collecting:
            try:
                # Collect system metrics
                self._collect_system_metrics()

                # Collect custom metrics from callbacks
                self._collect_custom_metrics()
            except Exception as e:
                logger.error(f"Error in metrics collection loop: {e}")

            # Interruptible pause: returns early once stop_collection() sets
            # the event, so shutdown does not depend on the interval length.
            self._stop_event.wait(self.collection_interval)

    def _collect_system_metrics(self) -> None:
        """Collect system-level metrics.

        Takes one sample via ``psutil``, appends it to the system history and
        mirrors the values into gauges and counters. Any exception is logged
        and swallowed, so a failed sample records nothing.
        """
        try:
            # CPU usage
            cpu_percent = psutil.cpu_percent(interval=None)

            # Memory usage
            memory = psutil.virtual_memory()
            memory_available_gb = memory.available / (1024**3)  # GB

            # Disk usage (root partition)
            disk = psutil.disk_usage('/')

            # Network stats
            network = psutil.net_io_counters()

            # Active connections
            # NOTE(review): psutil documents that net_connections() may need
            # elevated privileges on some platforms; a failure here aborts the
            # whole sample via the except clause below -- confirm acceptable.
            connections = len(psutil.net_connections())

            # Create system metrics object
            sys_metrics = SystemMetrics(
                cpu_usage=cpu_percent,
                memory_usage=memory.percent,
                memory_available=memory_available_gb,
                disk_usage=disk.percent,
                network_bytes_sent=network.bytes_sent,
                network_bytes_recv=network.bytes_recv,
                active_connections=connections,
                timestamp=get_current_timestamp(),
            )

            with self._lock:
                self.system_metrics_history.append(sys_metrics)

                # Update gauges
                self.gauges['system_cpu_usage'] = cpu_percent
                self.gauges['system_memory_usage'] = memory.percent
                self.gauges['system_memory_available_gb'] = memory_available_gb
                self.gauges['system_disk_usage'] = disk.percent
                self.gauges['system_active_connections'] = connections

                # Update counters (cumulative): psutil's totals are assigned,
                # not added to the previous value.
                self.counters['system_network_bytes_sent'] = network.bytes_sent
                self.counters['system_network_bytes_recv'] = network.bytes_recv

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    def _collect_custom_metrics(self) -> None:
        """Collect metrics from registered callbacks.

        Each callback is expected to return a ``{name: value}`` dict; every
        entry is recorded as a gauge. Non-dict results are ignored. A failing
        callback is logged and does not prevent the remaining ones from running.
        """
        for callback in self.metric_callbacks:
            try:
                custom_metrics = callback()
                if isinstance(custom_metrics, dict):
                    # Hold the lock across the batch so one callback's values
                    # become visible together.
                    with self._lock:
                        for name, value in custom_metrics.items():
                            self.record_gauge(name, value)
            except Exception as e:
                logger.error(f"Error collecting custom metrics: {e}")

    def record_counter(self, name: str, value: float = 1.0,
                       labels: Optional[Dict[str, str]] = None) -> None:
        """Record a counter metric (cumulative).

        Adds ``value`` to the running total for ``name`` and stores a point
        carrying the new total. Totals are keyed by name only; ``labels`` are
        attached to the stored point.
        """
        with self._lock:
            self.counters[name] += value

            # Store metric point
            point = MetricPoint(
                name=name,
                value=self.counters[name],
                timestamp=get_current_timestamp(),
                labels=labels or {},
            )
            self.metrics[name].append(point)

    def record_gauge(self, name: str, value: float,
                     labels: Optional[Dict[str, str]] = None) -> None:
        """Record a gauge metric (current value).

        Overwrites the gauge for ``name`` and stores a point with ``value``.
        """
        with self._lock:
            self.gauges[name] = value

            # Store metric point
            point = MetricPoint(
                name=name,
                value=value,
                timestamp=get_current_timestamp(),
                labels=labels or {},
            )
            self.metrics[name].append(point)

    def record_histogram(self, name: str, value: float,
                         labels: Optional[Dict[str, str]] = None) -> None:
        """Record a histogram metric (for latency, sizes, etc.).

        Appends ``value`` to the observations for ``name``, keeping only the
        most recent ``_HISTOGRAM_WINDOW`` of them, and stores a point.
        """
        with self._lock:
            observations = self.histograms[name]
            observations.append(value)

            # Keep only recent values
            if len(observations) > self._HISTOGRAM_WINDOW:
                del observations[:-self._HISTOGRAM_WINDOW]

            # Store metric point
            point = MetricPoint(
                name=name,
                value=value,
                timestamp=get_current_timestamp(),
                labels=labels or {},
            )
            self.metrics[name].append(point)

    def increment_counter(self, name: str,
                          labels: Optional[Dict[str, str]] = None) -> None:
        """Increment a counter by 1"""
        self.record_counter(name, 1.0, labels)

    def set_gauge(self, name: str, value: float,
                  labels: Optional[Dict[str, str]] = None) -> None:
        """Set a gauge value"""
        self.record_gauge(name, value, labels)

    def observe_histogram(self, name: str, value: float,
                          labels: Optional[Dict[str, str]] = None) -> None:
        """Observe a value in a histogram"""
        self.record_histogram(name, value, labels)

    def _summarize_histogram(self, values: List[float]) -> Dict[str, Any]:
        """Return count/sum/avg/min/max/p50/p95/p99 for one histogram."""
        if not values:
            # Same all-zero summary an empty observation list always produced.
            return {
                'count': 0, 'sum': 0, 'avg': 0, 'min': 0,
                'max': 0, 'p50': 0, 'p95': 0, 'p99': 0,
            }

        total = sum(values)
        return {
            'count': len(values),
            'sum': total,
            'avg': total / len(values),
            'min': min(values),
            'max': max(values),
            'p50': self._percentile(values, 50),
            'p95': self._percentile(values, 95),
            'p99': self._percentile(values, 99),
        }

    def get_current_metrics(self) -> Dict[str, Any]:
        """Get current metric values.

        Returns:
            Dict with keys ``counters`` and ``gauges`` (copies), ``histograms``
            (per-name summary statistics) and ``system`` (latest system
            snapshot as a dict, or ``None`` if none has been collected).
        """
        with self._lock:
            return {
                'counters': dict(self.counters),
                'gauges': dict(self.gauges),
                'histograms': {
                    name: self._summarize_histogram(values)
                    for name, values in self.histograms.items()
                },
                'system': self.get_latest_system_metrics(),
            }

    def get_latest_system_metrics(self) -> Optional[Dict[str, Any]]:
        """Get the latest system metrics.

        Returns:
            The most recent snapshot as a plain dict with an ISO-formatted
            timestamp, or ``None`` when no snapshot exists yet.
        """
        with self._lock:
            if not self.system_metrics_history:
                return None

            latest = self.system_metrics_history[-1]
            return {
                'cpu_usage': latest.cpu_usage,
                'memory_usage': latest.memory_usage,
                'memory_available_gb': latest.memory_available,
                'disk_usage': latest.disk_usage,
                'network_bytes_sent': latest.network_bytes_sent,
                'network_bytes_recv': latest.network_bytes_recv,
                'active_connections': latest.active_connections,
                'timestamp': latest.timestamp.isoformat(),
            }

    def get_metric_history(self, name: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Get historical values for a specific metric.

        Args:
            name: Metric name.
            limit: Maximum number of most recent points to return.

        Returns:
            Up to ``limit`` newest points, oldest first, each as a dict with
            ``value``, ``timestamp`` (ISO string) and ``labels``. Empty for an
            unknown metric or a non-positive ``limit``.
        """
        # Guard explicitly: seq[-0:] is the whole sequence, so limit=0 would
        # otherwise return every stored point instead of none.
        if limit <= 0:
            return []

        with self._lock:
            # Membership test first so the defaultdict does not create an
            # empty entry for an unknown name.
            if name not in self.metrics:
                return []

            points = list(self.metrics[name])[-limit:]
            return [
                {
                    'value': point.value,
                    'timestamp': point.timestamp.isoformat(),
                    'labels': point.labels,
                }
                for point in points
            ]

    def get_prometheus_metrics(self) -> str:
        """Export metrics in Prometheus format.

        Returns:
            Newline-joined text with a ``# TYPE`` line and a sample line per
            counter and gauge, and ``_count``, ``_sum`` and ``_percentile``
            lines for every non-empty histogram.
        """
        lines = []

        with self._lock:
            # Export counters
            for name, value in self.counters.items():
                lines.append(f"# TYPE {name} counter")
                lines.append(f"{name} {value}")

            # Export gauges
            for name, value in self.gauges.items():
                lines.append(f"# TYPE {name} gauge")
                lines.append(f"{name} {value}")

            # Export histograms
            # NOTE(review): the series emitted are quantiles rather than
            # buckets, which resembles a Prometheus "summary" more than a
            # "histogram"; output is kept as-is for existing consumers.
            for name, values in self.histograms.items():
                if not values:
                    continue

                lines.append(f"# TYPE {name} histogram")
                lines.append(f"{name}_count {len(values)}")
                lines.append(f"{name}_sum {sum(values)}")

                # Add percentiles
                for percentile in self._PERCENTILES:
                    p_value = self._percentile(values, percentile)
                    lines.append(
                        f'{name}_percentile{{quantile="0.{percentile:02d}"}} {p_value}'
                    )

        return "\n".join(lines)

    def register_callback(self, callback: Callable[[], Dict[str, float]]) -> None:
        """Register a callback for custom metrics collection.

        The callback is invoked with no arguments on every collection round.
        """
        self.metric_callbacks.append(callback)
        # functools.partial objects and callable instances have no __name__;
        # fall back to repr() so registration cannot fail while logging.
        callback_name = getattr(callback, '__name__', None) or repr(callback)
        logger.info(f"Registered metrics callback: {callback_name}")

    def get_performance_summary(self) -> Dict[str, Any]:
        """Get a performance summary.

        Returns:
            Dict with the current timestamp, system snapshot, counters and
            gauges, plus ``performance_indicators`` holding the average and
            p95 of every histogram whose name contains "latency".
        """
        current_metrics = self.get_current_metrics()

        # Calculate rates and trends
        summary = {
            'timestamp': get_current_timestamp().isoformat(),
            'system': current_metrics.get('system', {}),
            'counters': current_metrics.get('counters', {}),
            'gauges': current_metrics.get('gauges', {}),
            'performance_indicators': {},
        }

        # Add performance indicators
        indicators = summary['performance_indicators']
        for name, stats in current_metrics.get('histograms', {}).items():
            if 'latency' in name.lower():
                indicators[f"{name}_avg_ms"] = stats['avg']
                indicators[f"{name}_p95_ms"] = stats['p95']

        return summary

    def _percentile(self, values: List[float], percentile: int) -> float:
        """Calculate percentile of values.

        Sorts a copy of ``values`` and returns the element at index
        ``int(percentile / 100 * len(values))``, clamped to the last index.
        Returns 0.0 for an empty list.
        """
        if not values:
            return 0.0

        sorted_values = sorted(values)
        index = int((percentile / 100.0) * len(sorted_values))
        index = min(index, len(sorted_values) - 1)
        return sorted_values[index]

    def reset_metrics(self) -> None:
        """Reset all metrics (useful for testing).

        Clears recorded points, counters, gauges, histograms and the system
        history. Registered callbacks and collection state are untouched.
        """
        with self._lock:
            self.metrics.clear()
            self.counters.clear()
            self.gauges.clear()
            self.histograms.clear()
            self.system_metrics_history.clear()

        logger.info("All metrics reset")

    def get_stats(self) -> Dict[str, Any]:
        """Get collector statistics (configuration and container sizes)."""
        with self._lock:
            return {
                'collecting': self._collecting,
                'collection_interval': self.collection_interval,
                'max_history': self.max_history,
                'total_metrics': len(self.metrics),
                'total_counters': len(self.counters),
                'total_gauges': len(self.gauges),
                'total_histograms': len(self.histograms),
                'system_metrics_count': len(self.system_metrics_history),
                'registered_callbacks': len(self.metric_callbacks),
            }
|
|
|
|
|
|
# Global metrics collector instance, created at import time with the default
# interval and history size. Collection does not start until
# start_collection() is called on it.
metrics_collector = MetricsCollector()