16. monitoring
COBY/monitoring/memory_monitor.py (new file, 561 lines)
@@ -0,0 +1,561 @@
"""
Memory usage monitoring and garbage collection optimization.
"""

import gc
import sys
import threading
import tracemalloc
from typing import Dict, List, Optional, Any, Tuple
from collections import defaultdict, deque
from datetime import datetime, timezone
from dataclasses import dataclass

from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp
# Import will be done lazily to avoid circular imports

logger = get_logger(__name__)


@dataclass
class MemorySnapshot:
    """Memory usage snapshot"""
    timestamp: datetime
    total_memory_mb: float
    available_memory_mb: float
    process_memory_mb: float
    gc_collections: Dict[int, int]
    gc_objects: int
    tracemalloc_current_mb: Optional[float] = None
    tracemalloc_peak_mb: Optional[float] = None


@dataclass
class MemoryLeak:
    """Memory leak detection result"""
    object_type: str
    count_increase: int
    size_increase_mb: float
    growth_rate_per_hour: float
    severity: str  # 'low', 'medium', 'high'
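

# Illustrative record (hypothetical values) of the leak report produced by
# MemoryMonitor._analyze_object_growth below: the growth rate per hour maps
# onto the 'low' / 'medium' / 'high' severity labels.
#
#     MemoryLeak(
#         object_type='dict',
#         count_increase=1500,
#         size_increase_mb=1.5,          # rough estimate: count * 0.001 MB
#         growth_rate_per_hour=1500.0,   # > 1000 per hour -> 'high'
#         severity='high',
#     )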


class MemoryMonitor:
    """
    Monitors memory usage, detects leaks, and optimizes garbage collection.

    Provides detailed memory analytics and automatic GC optimization.
    """

    def __init__(self, enable_tracemalloc: bool = True, snapshot_interval: float = 30.0):
        """
        Initialize memory monitor.

        Args:
            enable_tracemalloc: Whether to enable detailed memory tracing
            snapshot_interval: How often to take memory snapshots (seconds)
        """
        self.enable_tracemalloc = enable_tracemalloc
        self.snapshot_interval = snapshot_interval

        # Memory tracking
        self.memory_snapshots: deque = deque(maxlen=1000)
        self.object_counts: Dict[str, deque] = defaultdict(lambda: deque(maxlen=100))

        # GC optimization
        self.gc_stats: Dict[str, Any] = {}
        self.gc_thresholds = gc.get_threshold()
        self.auto_gc_enabled = True

        # Leak detection
        self.leak_detection_enabled = True
        self.detected_leaks: List[MemoryLeak] = []

        # Monitoring control
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None

        # Initialize tracemalloc if enabled
        if self.enable_tracemalloc and not tracemalloc.is_tracing():
            tracemalloc.start()
            logger.info("Started tracemalloc for detailed memory tracking")

        logger.info(f"Memory monitor initialized (tracemalloc: {self.enable_tracemalloc})")

    def start_monitoring(self) -> None:
        """Start memory monitoring"""
        if self._monitoring:
            logger.warning("Memory monitoring already running")
            return

        self._monitoring = True
        self._monitor_thread = threading.Thread(
            target=self._monitoring_loop,
            name="MemoryMonitor",
            daemon=True
        )
        self._monitor_thread.start()
        logger.info("Started memory monitoring")

    def stop_monitoring(self) -> None:
        """Stop memory monitoring"""
        if not self._monitoring:
            return

        self._monitoring = False
        if self._monitor_thread:
            self._monitor_thread.join(timeout=5.0)
        logger.info("Stopped memory monitoring")

    def _monitoring_loop(self) -> None:
        """Main monitoring loop"""
        import time

        while self._monitoring:
            try:
                # Take memory snapshot
                self._take_memory_snapshot()

                # Update object counts
                self._update_object_counts()

                # Check for memory leaks
                if self.leak_detection_enabled:
                    self._check_for_leaks()

                # Optimize garbage collection
                if self.auto_gc_enabled:
                    self._optimize_gc()

                time.sleep(self.snapshot_interval)

            except Exception as e:
                logger.error(f"Error in memory monitoring loop: {e}")
                time.sleep(self.snapshot_interval)

    def _take_memory_snapshot(self) -> None:
        """Take a memory usage snapshot"""
        try:
            import psutil

            # Get system memory info
            memory = psutil.virtual_memory()

            # Get process memory info
            process = psutil.Process()
            process_memory = process.memory_info()

            # Get GC stats
            gc_collections = {i: gc.get_count()[i] for i in range(3)}
            gc_objects = len(gc.get_objects())

            # Get tracemalloc stats if enabled
            tracemalloc_current_mb = None
            tracemalloc_peak_mb = None

            if self.enable_tracemalloc and tracemalloc.is_tracing():
                current, peak = tracemalloc.get_traced_memory()
                tracemalloc_current_mb = current / (1024 * 1024)
                tracemalloc_peak_mb = peak / (1024 * 1024)

            # Create snapshot
            snapshot = MemorySnapshot(
                timestamp=get_current_timestamp(),
                total_memory_mb=memory.total / (1024 * 1024),
                available_memory_mb=memory.available / (1024 * 1024),
                process_memory_mb=process_memory.rss / (1024 * 1024),
                gc_collections=gc_collections,
                gc_objects=gc_objects,
                tracemalloc_current_mb=tracemalloc_current_mb,
                tracemalloc_peak_mb=tracemalloc_peak_mb
            )

            self.memory_snapshots.append(snapshot)

            # Update metrics
            try:
                from .metrics_collector import metrics_collector
                metrics_collector.set_gauge('memory_total_mb', snapshot.total_memory_mb)
                metrics_collector.set_gauge('memory_available_mb', snapshot.available_memory_mb)
                metrics_collector.set_gauge('memory_process_mb', snapshot.process_memory_mb)
                metrics_collector.set_gauge('memory_gc_objects', snapshot.gc_objects)

                if tracemalloc_current_mb is not None:
                    metrics_collector.set_gauge('memory_tracemalloc_current_mb', tracemalloc_current_mb)
                    metrics_collector.set_gauge('memory_tracemalloc_peak_mb', tracemalloc_peak_mb)
            except ImportError:
                pass  # Metrics collector not available

        except Exception as e:
            logger.error(f"Error taking memory snapshot: {e}")

    def _update_object_counts(self) -> None:
        """Update object counts by type"""
        try:
            # Count objects by type
            object_counts = defaultdict(int)

            for obj in gc.get_objects():
                obj_type = type(obj).__name__
                object_counts[obj_type] += 1

            # Store counts with timestamp
            timestamp = get_current_timestamp()
            for obj_type, count in object_counts.items():
                self.object_counts[obj_type].append((timestamp, count))

            # Update metrics for common types
            try:
                from .metrics_collector import metrics_collector
                common_types = ['dict', 'list', 'tuple', 'str', 'function', 'type']
                for obj_type in common_types:
                    if obj_type in object_counts:
                        metrics_collector.set_gauge(
                            f'memory_objects_{obj_type}',
                            object_counts[obj_type]
                        )
            except ImportError:
                pass  # Metrics collector not available

        except Exception as e:
            logger.error(f"Error updating object counts: {e}")

    def _check_for_leaks(self) -> None:
        """Check for potential memory leaks"""
        try:
            if len(self.memory_snapshots) < 10:
                return  # Need more data

            # Check for consistent memory growth
            recent_snapshots = list(self.memory_snapshots)[-10:]
            memory_values = [s.process_memory_mb for s in recent_snapshots]

            # Simple trend check to detect sustained growth
            if self._is_memory_growing(memory_values):
                # Check object count growth
                potential_leaks = self._analyze_object_growth()

                for leak in potential_leaks:
                    if leak not in self.detected_leaks:
                        self.detected_leaks.append(leak)
                        logger.warning(f"Potential memory leak detected: {leak.object_type}")

                        # Record leak detection
                        try:
                            from .metrics_collector import metrics_collector
                            metrics_collector.increment_counter('memory_leaks_detected')
                        except ImportError:
                            pass

        except Exception as e:
            logger.error(f"Error checking for leaks: {e}")

    def _is_memory_growing(self, memory_values: List[float], threshold: float = 5.0) -> bool:
        """Check if memory is consistently growing"""
        if len(memory_values) < 5:
            return False

        # Check if memory increased by more than threshold MB
        growth = memory_values[-1] - memory_values[0]
        return growth > threshold

    def _analyze_object_growth(self) -> List[MemoryLeak]:
        """Analyze object count growth to identify potential leaks"""
        leaks = []

        for obj_type, counts in self.object_counts.items():
            if len(counts) < 10:
                continue

            # Get recent counts
            recent_counts = list(counts)[-10:]
            timestamps = [item[0] for item in recent_counts]
            count_values = [item[1] for item in recent_counts]

            # Check for growth
            if len(count_values) >= 2:
                growth = count_values[-1] - count_values[0]
                time_diff = (timestamps[-1] - timestamps[0]).total_seconds() / 3600  # hours

                if growth > 100 and time_diff > 0:  # More than 100 objects growth
                    growth_rate = growth / time_diff

                    # Determine severity
                    if growth_rate > 1000:
                        severity = 'high'
                    elif growth_rate > 100:
                        severity = 'medium'
                    else:
                        severity = 'low'

                    leak = MemoryLeak(
                        object_type=obj_type,
                        count_increase=growth,
                        size_increase_mb=growth * 0.001,  # Rough estimate
                        growth_rate_per_hour=growth_rate,
                        severity=severity
                    )
                    leaks.append(leak)

        return leaks
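
    # Worked example (hypothetical numbers): with the default 30 s snapshot
    # interval, 10 samples span ~4.5 minutes (0.075 h). A 'dict' count that
    # grows by 150 objects over that window gives growth_rate = 150 / 0.075
    # = 2000 objects/hour, which the thresholds above classify as 'high'.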

    def _optimize_gc(self) -> None:
        """Optimize garbage collection based on memory usage"""
        try:
            if not self.memory_snapshots:
                return

            latest_snapshot = self.memory_snapshots[-1]
            memory_usage_percent = (
                (latest_snapshot.total_memory_mb - latest_snapshot.available_memory_mb) /
                latest_snapshot.total_memory_mb * 100
            )

            # Adjust GC thresholds based on memory pressure
            if memory_usage_percent > 85:
                # High memory pressure - more aggressive GC
                new_thresholds = (500, 10, 10)
                if gc.get_threshold() != new_thresholds:
                    gc.set_threshold(*new_thresholds)
                    logger.info("Enabled aggressive garbage collection due to high memory usage")

                # Force collection
                collected = gc.collect()
                try:
                    from .metrics_collector import metrics_collector
                    metrics_collector.increment_counter('memory_gc_forced')
                except ImportError:
                    pass  # Metrics collector not available
                logger.debug(f"Forced GC collected {collected} objects")

            elif memory_usage_percent < 50:
                # Low memory pressure - less aggressive GC
                new_thresholds = (1000, 20, 20)
                if gc.get_threshold() != new_thresholds:
                    gc.set_threshold(*new_thresholds)
                    logger.info("Reduced garbage collection frequency due to low memory usage")

            # Update GC stats
            self.gc_stats = {
                'threshold': gc.get_threshold(),
                'counts': gc.get_count(),
                'collections': gc.get_stats()
            }

        except Exception as e:
            logger.error(f"Error optimizing GC: {e}")

    def force_garbage_collection(self) -> Dict[str, int]:
        """Force garbage collection and return statistics"""
        try:
            # Get counts before collection
            before_counts = gc.get_count()
            before_objects = len(gc.get_objects())

            # Force collection for all generations
            collected = [gc.collect(generation) for generation in range(3)]
            total_collected = sum(collected)

            # Get counts after collection
            after_counts = gc.get_count()
            after_objects = len(gc.get_objects())

            # Update metrics
            try:
                from .metrics_collector import metrics_collector
                metrics_collector.increment_counter('memory_gc_manual')
                metrics_collector.set_gauge('memory_gc_objects_collected', total_collected)
            except ImportError:
                pass

            result = {
                'total_collected': total_collected,
                'by_generation': collected,
                'objects_before': before_objects,
                'objects_after': after_objects,
                'objects_freed': before_objects - after_objects,
                'counts_before': before_counts,
                'counts_after': after_counts
            }

            logger.info(f"Manual GC collected {total_collected} objects, freed {result['objects_freed']} objects")
            return result

        except Exception as e:
            logger.error(f"Error during forced garbage collection: {e}")
            return {}

    def get_memory_usage_summary(self) -> Dict[str, Any]:
        """Get current memory usage summary"""
        if not self.memory_snapshots:
            return {}

        latest = self.memory_snapshots[-1]

        # Calculate memory usage percentage
        memory_usage_percent = (
            (latest.total_memory_mb - latest.available_memory_mb) /
            latest.total_memory_mb * 100
        )

        return {
            'timestamp': latest.timestamp.isoformat(),
            'total_memory_mb': latest.total_memory_mb,
            'available_memory_mb': latest.available_memory_mb,
            'used_memory_mb': latest.total_memory_mb - latest.available_memory_mb,
            'memory_usage_percent': memory_usage_percent,
            'process_memory_mb': latest.process_memory_mb,
            'gc_objects': latest.gc_objects,
            'gc_collections': latest.gc_collections,
            'tracemalloc_current_mb': latest.tracemalloc_current_mb,
            'tracemalloc_peak_mb': latest.tracemalloc_peak_mb
        }

    def get_memory_trends(self, hours: int = 1) -> Dict[str, Any]:
        """Get memory usage trends over specified time period"""
        if not self.memory_snapshots:
            return {}

        from datetime import timedelta
        cutoff_time = get_current_timestamp() - timedelta(hours=hours)

        # Filter snapshots
        recent_snapshots = [
            s for s in self.memory_snapshots
            if s.timestamp >= cutoff_time
        ]

        if len(recent_snapshots) < 2:
            return {'trend': 'insufficient_data'}

        # Calculate trends
        process_memory_values = [s.process_memory_mb for s in recent_snapshots]
        gc_object_values = [s.gc_objects for s in recent_snapshots]

        return {
            'process_memory': {
                'start_mb': process_memory_values[0],
                'end_mb': process_memory_values[-1],
                'change_mb': process_memory_values[-1] - process_memory_values[0],
                'max_mb': max(process_memory_values),
                'min_mb': min(process_memory_values),
                'avg_mb': sum(process_memory_values) / len(process_memory_values)
            },
            'gc_objects': {
                'start_count': gc_object_values[0],
                'end_count': gc_object_values[-1],
                'change_count': gc_object_values[-1] - gc_object_values[0],
                'max_count': max(gc_object_values),
                'min_count': min(gc_object_values),
                'avg_count': sum(gc_object_values) / len(gc_object_values)
            },
            'sample_count': len(recent_snapshots),
            'time_period_hours': hours
        }

    def get_top_memory_consumers(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Get top memory consuming object types"""
        if not self.object_counts:
            return []

        # Get latest counts
        latest_counts = {}
        for obj_type, counts in self.object_counts.items():
            if counts:
                latest_counts[obj_type] = counts[-1][1]  # Get count from (timestamp, count) tuple

        # Sort by count
        sorted_types = sorted(
            latest_counts.items(),
            key=lambda x: x[1],
            reverse=True
        )

        return [
            {
                'object_type': obj_type,
                'count': count,
                'estimated_size_mb': count * 0.001  # Rough estimate
            }
            for obj_type, count in sorted_types[:limit]
        ]

    def get_detected_leaks(self) -> List[Dict[str, Any]]:
        """Get detected memory leaks"""
        return [
            {
                'object_type': leak.object_type,
                'count_increase': leak.count_increase,
                'size_increase_mb': leak.size_increase_mb,
                'growth_rate_per_hour': leak.growth_rate_per_hour,
                'severity': leak.severity
            }
            for leak in self.detected_leaks
        ]

    def get_tracemalloc_top(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Get top memory allocations from tracemalloc"""
        if not self.enable_tracemalloc or not tracemalloc.is_tracing():
            return []

        try:
            snapshot = tracemalloc.take_snapshot()
            top_stats = snapshot.statistics('lineno')

            return [
                {
                    'filename': stat.traceback.format()[0],
                    'size_mb': stat.size / (1024 * 1024),
                    'count': stat.count
                }
                for stat in top_stats[:limit]
            ]

        except Exception as e:
            logger.error(f"Error getting tracemalloc top: {e}")
            return []

    def clear_leak_history(self) -> None:
        """Clear detected leak history"""
        self.detected_leaks.clear()
        logger.info("Cleared memory leak history")

    def get_gc_stats(self) -> Dict[str, Any]:
        """Get garbage collection statistics"""
        return {
            'thresholds': gc.get_threshold(),
            'counts': gc.get_count(),
            'stats': gc.get_stats(),
            'auto_gc_enabled': self.auto_gc_enabled,
            'is_enabled': gc.isenabled()
        }

    def set_gc_thresholds(self, gen0: int, gen1: int, gen2: int) -> None:
        """Set garbage collection thresholds"""
        gc.set_threshold(gen0, gen1, gen2)
        logger.info(f"Set GC thresholds to ({gen0}, {gen1}, {gen2})")

    def enable_auto_gc_optimization(self, enabled: bool = True) -> None:
        """Enable or disable automatic GC optimization"""
        self.auto_gc_enabled = enabled
        logger.info(f"Auto GC optimization {'enabled' if enabled else 'disabled'}")

    def enable_leak_detection(self, enabled: bool = True) -> None:
        """Enable or disable memory leak detection"""
        self.leak_detection_enabled = enabled
        logger.info(f"Memory leak detection {'enabled' if enabled else 'disabled'}")

    def get_stats(self) -> Dict[str, Any]:
        """Get memory monitor statistics"""
        return {
            'monitoring': self._monitoring,
            'snapshot_interval': self.snapshot_interval,
            'snapshots_count': len(self.memory_snapshots),
            'object_types_tracked': len(self.object_counts),
            'detected_leaks': len(self.detected_leaks),
            'tracemalloc_enabled': self.enable_tracemalloc and tracemalloc.is_tracing(),
            'auto_gc_enabled': self.auto_gc_enabled,
            'leak_detection_enabled': self.leak_detection_enabled,
            'gc_thresholds': gc.get_threshold()
        }


# Global memory monitor instance
memory_monitor = MemoryMonitor()
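

# Minimal usage sketch (illustrative only; assumes the module is run in package
# context, e.g. `python -m COBY.monitoring.memory_monitor`, and that psutil is
# installed for snapshots). Guarded so importing the module is unaffected.
if __name__ == "__main__":
    import time

    monitor = MemoryMonitor(enable_tracemalloc=True, snapshot_interval=5.0)
    monitor.start_monitoring()
    try:
        # Let a few snapshots accumulate, then inspect the collected data.
        time.sleep(15)
        print(monitor.get_memory_usage_summary())
        print(monitor.get_top_memory_consumers(limit=5))
        print(monitor.get_detected_leaks())

        # A manual GC pass returns per-generation collection statistics.
        print(monitor.force_garbage_collection())
    finally:
        monitor.stop_monitoring()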