Files
gogo2/COBY/monitoring/metrics_collector.py
Dobromir Popov fa972ace8a 16. monitoring
2025-08-05 01:06:26 +03:00

395 lines
15 KiB
Python

"""
Comprehensive metrics collection for all system components.
"""
import time
import psutil
import threading
from typing import Dict, List, Optional, Any, Callable
from collections import defaultdict, deque
from datetime import datetime, timezone
from dataclasses import dataclass, field
from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp
# Logger for this module, obtained from the project logging helper using the module's __name__.
logger = get_logger(__name__)
@dataclass
class MetricPoint:
    """Individual metric data point.

    A single timestamped sample of a named metric, optionally qualified by
    string labels.
    """

    # Metric name; emitted verbatim as the series name.
    name: str
    # Sample value.
    value: float
    # Moment the sample was taken.
    timestamp: datetime
    # Label name -> label value pairs attached to the sample.
    labels: Dict[str, str] = field(default_factory=dict)

    @staticmethod
    def _escape_label_value(value: Any) -> str:
        """Escape a label value for the Prometheus text exposition format.

        Backslash, double quote and line feed must be written as ``\\\\``,
        ``\\"`` and ``\\n``. The backslash is handled first so the escapes
        added for the other two characters are not escaped again.
        """
        return (
            str(value)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
        )

    def to_prometheus_format(self) -> str:
        """Convert to Prometheus format.

        Returns:
            One exposition line: ``name{label="value",...} value timestamp_ms``.
            The label section is omitted when there are no labels, and the
            timestamp is expressed in integer milliseconds since the epoch.
        """
        labels_str = ""
        if self.labels:
            label_pairs = [
                f'{key}="{self._escape_label_value(value)}"'
                for key, value in self.labels.items()
            ]
            labels_str = "{" + ",".join(label_pairs) + "}"
        timestamp_ms = int(self.timestamp.timestamp() * 1000)
        return f"{self.name}{labels_str} {self.value} {timestamp_ms}"
@dataclass
class SystemMetrics:
    """Snapshot of host-level resource readings taken at one instant.

    Field order is part of the positional constructor signature and must not
    be rearranged.
    """

    # --- CPU / memory -----------------------------------------------------
    # CPU utilisation as reported by psutil.cpu_percent().
    cpu_usage: float
    # Memory utilisation (the `percent` field of psutil.virtual_memory()).
    memory_usage: float
    # Available memory; the collector in this module stores it in GB.
    memory_available: float

    # --- Disk -------------------------------------------------------------
    # Disk utilisation (the `percent` field of psutil.disk_usage()).
    disk_usage: float

    # --- Network ----------------------------------------------------------
    # Cumulative bytes sent, from psutil.net_io_counters().
    network_bytes_sent: int
    # Cumulative bytes received, from psutil.net_io_counters().
    network_bytes_recv: int
    # Number of entries returned by psutil.net_connections().
    active_connections: int

    # --- Bookkeeping ------------------------------------------------------
    # When the snapshot was taken.
    timestamp: datetime
class MetricsCollector:
    """
    Collects and manages performance metrics from all system components.

    Provides Prometheus-compatible metrics and real-time monitoring data.

    Counters, gauges and histogram samples are kept in memory. A background
    thread (see ``start_collection``) periodically samples host metrics via
    psutil and polls registered callbacks. All metric storage is accessed
    under a re-entrant lock.
    """

    # Percentiles reported for every histogram in snapshots and exports.
    _EXPORT_PERCENTILES = (50, 95, 99)

    def __init__(self, collection_interval: float = 1.0, max_history: int = 10000,
                 max_histogram_samples: int = 1000):
        """
        Initialize metrics collector.

        Args:
            collection_interval: How often to collect system metrics (seconds)
            max_history: Maximum number of metric points to keep in memory
                per metric name (and for the system metrics history)
            max_histogram_samples: Maximum number of most-recent samples kept
                per histogram; older samples are discarded
        """
        self.collection_interval = collection_interval
        self.max_history = max_history
        self.max_histogram_samples = max_histogram_samples

        # Metric storage
        self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=max_history))
        self.counters: Dict[str, float] = defaultdict(float)
        self.gauges: Dict[str, float] = defaultdict(float)
        self.histograms: Dict[str, List[float]] = defaultdict(list)

        # System metrics
        self.system_metrics_history: deque = deque(maxlen=max_history)

        # Collection control
        self._collecting = False
        self._collection_thread: Optional[threading.Thread] = None
        # Set by stop_collection() so the loop's inter-cycle wait ends at once
        # instead of sleeping out the remainder of the interval.
        self._stop_event = threading.Event()
        self._lock = threading.RLock()

        # Callbacks for custom metrics
        self.metric_callbacks: List[Callable[[], Dict[str, float]]] = []

        logger.info(f"Metrics collector initialized with {collection_interval}s interval")

    def start_collection(self) -> None:
        """Start automatic metrics collection in a daemon thread.

        Logs a warning and does nothing if collection is already running.
        """
        if self._collecting:
            logger.warning("Metrics collection already running")
            return
        self._collecting = True
        self._stop_event.clear()
        self._collection_thread = threading.Thread(
            target=self._collection_loop,
            name="MetricsCollector",
            daemon=True
        )
        self._collection_thread.start()
        logger.info("Started metrics collection")

    def stop_collection(self) -> None:
        """Stop automatic metrics collection.

        Wakes the collection thread and waits up to 5 seconds for it to exit.
        Does nothing if collection is not running.
        """
        if not self._collecting:
            return
        self._collecting = False
        self._stop_event.set()
        thread = self._collection_thread
        # A thread cannot join itself (RuntimeError); this matters when a
        # metrics callback running on the collector thread requests the stop.
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=5.0)
        logger.info("Stopped metrics collection")

    def _collection_loop(self) -> None:
        """Main collection loop; runs until ``stop_collection`` is called."""
        while self._collecting:
            try:
                # Collect system metrics
                self._collect_system_metrics()
                # Collect custom metrics from callbacks
                self._collect_custom_metrics()
            except Exception as e:
                logger.error(f"Error in metrics collection loop: {e}")
            # Interruptible pause: returns early once the stop event is set.
            self._stop_event.wait(self.collection_interval)

    def _collect_system_metrics(self) -> None:
        """Collect system-level metrics via psutil and publish them.

        Appends a SystemMetrics snapshot to the history and refreshes the
        ``system_*`` gauges and counters. Any failure is logged, not raised.
        """
        try:
            # CPU usage
            cpu_percent = psutil.cpu_percent(interval=None)
            # Memory usage
            memory = psutil.virtual_memory()
            # Disk usage (root partition)
            disk = psutil.disk_usage('/')
            # Network stats
            network = psutil.net_io_counters()
            # Active connections. Listing connections can be refused for
            # unprivileged processes; report 0 rather than dropping the
            # whole snapshot.
            try:
                connections = len(psutil.net_connections())
            except (psutil.AccessDenied, PermissionError):
                connections = 0

            memory_available_gb = memory.available / (1024**3)  # GB

            # Create system metrics object
            sys_metrics = SystemMetrics(
                cpu_usage=cpu_percent,
                memory_usage=memory.percent,
                memory_available=memory_available_gb,
                disk_usage=disk.percent,
                network_bytes_sent=network.bytes_sent,
                network_bytes_recv=network.bytes_recv,
                active_connections=connections,
                timestamp=get_current_timestamp()
            )
            with self._lock:
                self.system_metrics_history.append(sys_metrics)
                # Update gauges
                self.gauges['system_cpu_usage'] = cpu_percent
                self.gauges['system_memory_usage'] = memory.percent
                self.gauges['system_memory_available_gb'] = memory_available_gb
                self.gauges['system_disk_usage'] = disk.percent
                self.gauges['system_active_connections'] = connections
                # Update counters (cumulative)
                self.counters['system_network_bytes_sent'] = network.bytes_sent
                self.counters['system_network_bytes_recv'] = network.bytes_recv
        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    def _collect_custom_metrics(self) -> None:
        """Collect metrics from registered callbacks.

        Each callback is invoked without the lock held; a dict result is
        recorded as gauges. A failing callback is logged and skipped.
        """
        # Iterate over a snapshot so concurrent registration cannot affect
        # the loop.
        for callback in list(self.metric_callbacks):
            try:
                custom_metrics = callback()
                if isinstance(custom_metrics, dict):
                    with self._lock:
                        for name, value in custom_metrics.items():
                            self.record_gauge(name, value)
            except Exception as e:
                logger.error(f"Error collecting custom metrics: {e}")

    def _store_point(self, name: str, value: float,
                     labels: Optional[Dict[str, str]]) -> None:
        """Append a MetricPoint for ``name`` to the history (lock must be held)."""
        self.metrics[name].append(
            MetricPoint(
                name=name,
                value=value,
                timestamp=get_current_timestamp(),
                # Copy so later mutation by the caller cannot alter history.
                labels=dict(labels) if labels else {}
            )
        )

    def record_counter(self, name: str, value: float = 1.0,
                       labels: Optional[Dict[str, str]] = None) -> None:
        """Record a counter metric (cumulative).

        Adds ``value`` to the running total for ``name`` and stores the new
        total as a history point. Labels are attached to the point only; the
        total is kept per name.
        """
        with self._lock:
            self.counters[name] += value
            self._store_point(name, self.counters[name], labels)

    def record_gauge(self, name: str, value: float,
                     labels: Optional[Dict[str, str]] = None) -> None:
        """Record a gauge metric (current value) and store a history point."""
        with self._lock:
            self.gauges[name] = value
            self._store_point(name, value, labels)

    def record_histogram(self, name: str, value: float,
                         labels: Optional[Dict[str, str]] = None) -> None:
        """Record a histogram metric (for latency, sizes, etc.).

        Only the most recent ``max_histogram_samples`` observations per name
        are retained.
        """
        with self._lock:
            samples = self.histograms[name]
            samples.append(value)
            # Keep only recent values (trim in place, oldest first)
            excess = len(samples) - self.max_histogram_samples
            if excess > 0:
                del samples[:excess]
            self._store_point(name, value, labels)

    def increment_counter(self, name: str, labels: Optional[Dict[str, str]] = None) -> None:
        """Increment a counter by 1"""
        self.record_counter(name, 1.0, labels)

    def set_gauge(self, name: str, value: float,
                  labels: Optional[Dict[str, str]] = None) -> None:
        """Set a gauge value"""
        self.record_gauge(name, value, labels)

    def observe_histogram(self, name: str, value: float,
                          labels: Optional[Dict[str, str]] = None) -> None:
        """Observe a value in a histogram"""
        self.record_histogram(name, value, labels)

    def _summarize(self, values: List[float]) -> Dict[str, Any]:
        """Build the count/sum/avg/min/max/percentile summary for one histogram."""
        if not values:
            return {
                'count': 0, 'sum': 0, 'avg': 0, 'min': 0, 'max': 0,
                'p50': 0, 'p95': 0, 'p99': 0
            }
        ordered = sorted(values)  # sort once, reuse for every statistic
        total = sum(values)
        return {
            'count': len(values),
            'sum': total,
            'avg': total / len(values),
            'min': ordered[0],
            'max': ordered[-1],
            'p50': self._nearest_rank(ordered, 50),
            'p95': self._nearest_rank(ordered, 95),
            'p99': self._nearest_rank(ordered, 99)
        }

    def get_current_metrics(self) -> Dict[str, Any]:
        """Get current metric values.

        Returns:
            Dict with ``counters`` and ``gauges`` (copies), ``histograms``
            (per-name summary statistics) and ``system`` (latest system
            snapshot, or None if none has been collected).
        """
        with self._lock:
            return {
                'counters': dict(self.counters),
                'gauges': dict(self.gauges),
                'histograms': {
                    name: self._summarize(values)
                    for name, values in self.histograms.items()
                },
                'system': self.get_latest_system_metrics()
            }

    def get_latest_system_metrics(self) -> Optional[Dict[str, Any]]:
        """Get the latest system metrics as a plain dict, or None if empty."""
        with self._lock:
            if not self.system_metrics_history:
                return None
            latest = self.system_metrics_history[-1]
            return {
                'cpu_usage': latest.cpu_usage,
                'memory_usage': latest.memory_usage,
                'memory_available_gb': latest.memory_available,
                'disk_usage': latest.disk_usage,
                'network_bytes_sent': latest.network_bytes_sent,
                'network_bytes_recv': latest.network_bytes_recv,
                'active_connections': latest.active_connections,
                'timestamp': latest.timestamp.isoformat()
            }

    def get_metric_history(self, name: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Get historical values for a specific metric.

        Args:
            name: Metric name.
            limit: Maximum number of most-recent points to return. A
                non-positive limit yields an empty list.

        Returns:
            Points in recording order, each as a dict with ``value``,
            ``timestamp`` (ISO 8601 string) and ``labels``.
        """
        # A slice of [-0:] would select the whole history, so handle
        # non-positive limits explicitly.
        if limit <= 0:
            return []
        with self._lock:
            # Membership test avoids creating an empty entry in the defaultdict.
            if name not in self.metrics:
                return []
            points = list(self.metrics[name])[-limit:]
            return [
                {
                    'value': point.value,
                    'timestamp': point.timestamp.isoformat(),
                    'labels': dict(point.labels)
                }
                for point in points
            ]

    def get_prometheus_metrics(self) -> str:
        """Export metrics in Prometheus text exposition format.

        Counters and gauges are exported as single samples. Histogram samples
        are exported as a ``summary``: one ``name{quantile="..."}`` series per
        reported percentile plus ``name_sum`` and ``name_count``.

        Returns:
            The exposition text, terminated by a line feed, or an empty
            string when there is nothing to export.
        """
        lines: List[str] = []
        with self._lock:
            # Export counters
            for name, value in self.counters.items():
                lines.append(f"# TYPE {name} counter")
                lines.append(f"{name} {value}")
            # Export gauges
            for name, value in self.gauges.items():
                lines.append(f"# TYPE {name} gauge")
                lines.append(f"{name} {value}")
            # Export histograms (as summaries: quantiles are precomputed and
            # no buckets are tracked)
            for name, values in self.histograms.items():
                if not values:
                    continue
                ordered = sorted(values)
                lines.append(f"# TYPE {name} summary")
                for percentile in self._EXPORT_PERCENTILES:
                    quantile = percentile / 100
                    p_value = self._nearest_rank(ordered, percentile)
                    lines.append(f'{name}{{quantile="{quantile}"}} {p_value}')
                lines.append(f"{name}_sum {sum(values)}")
                lines.append(f"{name}_count {len(values)}")
        if not lines:
            return ""
        # The exposition format requires the final line to end with a line feed.
        return "\n".join(lines) + "\n"

    def register_callback(self, callback: Callable[[], Dict[str, float]]) -> None:
        """Register a callback for custom metrics collection.

        The callback is invoked on every collection cycle and should return a
        dict of metric name -> value, recorded as gauges.
        """
        self.metric_callbacks.append(callback)
        # Not every callable has __name__ (e.g. functools.partial objects).
        callback_name = getattr(callback, '__name__', None) or repr(callback)
        logger.info(f"Registered metrics callback: {callback_name}")

    def get_performance_summary(self) -> Dict[str, Any]:
        """Get a performance summary.

        Returns:
            Dict with the current ``timestamp``, ``system`` snapshot (empty
            dict when none is available), ``counters``, ``gauges`` and
            ``performance_indicators`` (average and p95 for every histogram
            whose name contains "latency").
        """
        current_metrics = self.get_current_metrics()
        summary = {
            'timestamp': get_current_timestamp().isoformat(),
            # 'system' is always present but may be None before the first
            # collection cycle.
            'system': current_metrics.get('system') or {},
            'counters': current_metrics.get('counters', {}),
            'gauges': current_metrics.get('gauges', {}),
            'performance_indicators': {}
        }
        # Add performance indicators
        histograms = current_metrics.get('histograms', {})
        for name, stats in histograms.items():
            if 'latency' in name.lower():
                summary['performance_indicators'][f"{name}_avg_ms"] = stats['avg']
                summary['performance_indicators'][f"{name}_p95_ms"] = stats['p95']
        return summary

    @staticmethod
    def _nearest_rank(ordered: List[float], percentile: float) -> float:
        """Return the nearest-rank percentile of an already sorted, non-empty list."""
        count = len(ordered)
        # Ceiling division without floats: rank = ceil(percentile * count / 100).
        rank = -(-(percentile * count) // 100)
        # Convert the 1-based rank to an index and clamp into range, which
        # also covers percentile <= 0 and percentile > 100.
        index = min(max(int(rank) - 1, 0), count - 1)
        return ordered[index]

    def _percentile(self, values: List[float], percentile: int) -> float:
        """Calculate percentile of values using the nearest-rank method.

        Returns 0.0 for an empty input; otherwise the smallest sample such
        that at least ``percentile`` percent of the samples are <= it.
        """
        if not values:
            return 0.0
        return self._nearest_rank(sorted(values), percentile)

    def reset_metrics(self) -> None:
        """Reset all metrics (useful for testing)"""
        with self._lock:
            self.metrics.clear()
            self.counters.clear()
            self.gauges.clear()
            self.histograms.clear()
            self.system_metrics_history.clear()
        logger.info("All metrics reset")

    def get_stats(self) -> Dict[str, Any]:
        """Get collector statistics (configuration and container sizes)."""
        with self._lock:
            return {
                'collecting': self._collecting,
                'collection_interval': self.collection_interval,
                'max_history': self.max_history,
                'total_metrics': len(self.metrics),
                'total_counters': len(self.counters),
                'total_gauges': len(self.gauges),
                'total_histograms': len(self.histograms),
                'system_metrics_count': len(self.system_metrics_history),
                'registered_callbacks': len(self.metric_callbacks)
            }
# Global metrics collector instance, created at import time with the default
# constructor arguments. Constructing it does not start background collection;
# call start_collection() for that.
metrics_collector = MetricsCollector()