16. monitoring

This commit is contained in:
Dobromir Popov
2025-08-05 01:06:26 +03:00
parent 3e0d7d5a99
commit fa972ace8a
7 changed files with 2701 additions and 0 deletions

View File

@ -0,0 +1,561 @@
"""
Memory usage monitoring and garbage collection optimization.
"""
import gc
import sys
import threading
import tracemalloc
from typing import Dict, List, Optional, Any, Tuple
from collections import defaultdict, deque
from datetime import datetime, timezone
from dataclasses import dataclass
from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp
# Import will be done lazily to avoid circular imports
logger = get_logger(__name__)
@dataclass
class MemorySnapshot:
    """Memory usage snapshot taken at a single point in time."""
    timestamp: datetime                  # when the snapshot was taken
    total_memory_mb: float               # total system RAM, in MiB
    available_memory_mb: float           # available system RAM, in MiB
    process_memory_mb: float             # this process's RSS, in MiB
    gc_collections: Dict[int, int]       # gc.get_count() value per generation (0-2)
    gc_objects: int                      # total objects tracked by the GC
    tracemalloc_current_mb: Optional[float] = None  # current traced size, None if tracing off
    tracemalloc_peak_mb: Optional[float] = None     # peak traced size, None if tracing off
@dataclass
class MemoryLeak:
    """Memory leak detection result"""
    object_type: str             # type name whose instance count kept growing
    count_increase: int          # object-count growth over the analysis window
    size_increase_mb: float      # rough size estimate of that growth, in MiB
    growth_rate_per_hour: float  # objects gained per hour
    severity: str  # 'low', 'medium', 'high'
class MemoryMonitor:
    """
    Monitors memory usage, detects leaks, and optimizes garbage collection.

    A background thread periodically snapshots system/process memory (via
    psutil), records per-type object counts, flags object types whose counts
    grow persistently, and tunes GC thresholds based on memory pressure.
    Provides detailed memory analytics and automatic GC optimization.
    """

    def __init__(self, enable_tracemalloc: bool = True, snapshot_interval: float = 30.0):
        """
        Initialize memory monitor.

        Args:
            enable_tracemalloc: Whether to enable detailed memory tracing
            snapshot_interval: How often to take memory snapshots (seconds)
        """
        self.enable_tracemalloc = enable_tracemalloc
        self.snapshot_interval = snapshot_interval

        # Memory tracking (bounded deques so the monitor itself stays flat)
        self.memory_snapshots: deque = deque(maxlen=1000)
        self.object_counts: Dict[str, deque] = defaultdict(lambda: deque(maxlen=100))

        # GC optimization state
        self.gc_stats: Dict[str, Any] = {}
        self.gc_thresholds = gc.get_threshold()
        self.auto_gc_enabled = True

        # Leak detection state
        self.leak_detection_enabled = True
        self.detected_leaks: List["MemoryLeak"] = []

        # Background monitoring control
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None

        # Start tracemalloc once, only if requested and not already tracing
        if self.enable_tracemalloc and not tracemalloc.is_tracing():
            tracemalloc.start()
            logger.info("Started tracemalloc for detailed memory tracking")
        logger.info(f"Memory monitor initialized (tracemalloc: {self.enable_tracemalloc})")

    def start_monitoring(self) -> None:
        """Start the background monitoring thread (no-op if already running)."""
        if self._monitoring:
            logger.warning("Memory monitoring already running")
            return
        self._monitoring = True
        self._monitor_thread = threading.Thread(
            target=self._monitoring_loop,
            name="MemoryMonitor",
            daemon=True
        )
        self._monitor_thread.start()
        logger.info("Started memory monitoring")

    def stop_monitoring(self) -> None:
        """Stop monitoring and join the worker thread (bounded 5s wait)."""
        if not self._monitoring:
            return
        self._monitoring = False
        if self._monitor_thread:
            self._monitor_thread.join(timeout=5.0)
        logger.info("Stopped memory monitoring")

    def _monitoring_loop(self) -> None:
        """Main loop: snapshot, count objects, check leaks, tune GC, sleep."""
        import time
        while self._monitoring:
            try:
                self._take_memory_snapshot()
                self._update_object_counts()
                if self.leak_detection_enabled:
                    self._check_for_leaks()
                if self.auto_gc_enabled:
                    self._optimize_gc()
                time.sleep(self.snapshot_interval)
            except Exception as e:
                # Keep the thread alive on any per-cycle failure
                logger.error(f"Error in memory monitoring loop: {e}")
                time.sleep(self.snapshot_interval)

    def _take_memory_snapshot(self) -> None:
        """Take a memory usage snapshot and publish gauges when metrics exist."""
        try:
            import psutil  # lazy: avoids a hard dependency at module import

            memory = psutil.virtual_memory()
            process_memory = psutil.Process().memory_info()

            # FIX: read gc.get_count() once instead of once per generation
            counts = gc.get_count()
            gc_collections = {generation: counts[generation] for generation in range(3)}
            gc_objects = len(gc.get_objects())

            tracemalloc_current_mb = None
            tracemalloc_peak_mb = None
            if self.enable_tracemalloc and tracemalloc.is_tracing():
                current, peak = tracemalloc.get_traced_memory()
                tracemalloc_current_mb = current / (1024 * 1024)
                tracemalloc_peak_mb = peak / (1024 * 1024)

            snapshot = MemorySnapshot(
                timestamp=get_current_timestamp(),
                total_memory_mb=memory.total / (1024 * 1024),
                available_memory_mb=memory.available / (1024 * 1024),
                process_memory_mb=process_memory.rss / (1024 * 1024),
                gc_collections=gc_collections,
                gc_objects=gc_objects,
                tracemalloc_current_mb=tracemalloc_current_mb,
                tracemalloc_peak_mb=tracemalloc_peak_mb
            )
            self.memory_snapshots.append(snapshot)

            # Metrics collector is imported lazily to avoid circular imports
            try:
                from .metrics_collector import metrics_collector
                metrics_collector.set_gauge('memory_total_mb', snapshot.total_memory_mb)
                metrics_collector.set_gauge('memory_available_mb', snapshot.available_memory_mb)
                metrics_collector.set_gauge('memory_process_mb', snapshot.process_memory_mb)
                metrics_collector.set_gauge('memory_gc_objects', snapshot.gc_objects)
                if tracemalloc_current_mb is not None:
                    metrics_collector.set_gauge('memory_tracemalloc_current_mb', tracemalloc_current_mb)
                    metrics_collector.set_gauge('memory_tracemalloc_peak_mb', tracemalloc_peak_mb)
            except ImportError:
                pass  # Metrics collector not available
        except Exception as e:
            logger.error(f"Error taking memory snapshot: {e}")

    def _update_object_counts(self) -> None:
        """Record per-type object counts and publish gauges for common types."""
        try:
            # Count live objects by type name (GC-tracked objects only)
            object_counts = defaultdict(int)
            for obj in gc.get_objects():
                object_counts[type(obj).__name__] += 1

            # Store (timestamp, count) pairs per type for trend analysis
            timestamp = get_current_timestamp()
            for obj_type, count in object_counts.items():
                self.object_counts[obj_type].append((timestamp, count))

            try:
                from .metrics_collector import metrics_collector
                common_types = ['dict', 'list', 'tuple', 'str', 'function', 'type']
                for obj_type in common_types:
                    if obj_type in object_counts:
                        metrics_collector.set_gauge(
                            f'memory_objects_{obj_type}',
                            object_counts[obj_type]
                        )
            except ImportError:
                pass  # Metrics collector not available
        except Exception as e:
            logger.error(f"Error updating object counts: {e}")

    def _check_for_leaks(self) -> None:
        """Check recent snapshots for growth and record newly suspected leaks."""
        try:
            if len(self.memory_snapshots) < 10:
                return  # Need more data

            recent_snapshots = list(self.memory_snapshots)[-10:]
            memory_values = [s.process_memory_mb for s in recent_snapshots]

            if self._is_memory_growing(memory_values):
                # FIX: dataclass equality compares every field, so re-detections
                # with slightly different growth rates were never "in"
                # detected_leaks and duplicates accumulated every cycle.
                # Deduplicate on object_type instead.
                known_types = {leak.object_type for leak in self.detected_leaks}
                for leak in self._analyze_object_growth():
                    if leak.object_type in known_types:
                        continue
                    self.detected_leaks.append(leak)
                    known_types.add(leak.object_type)
                    logger.warning(f"Potential memory leak detected: {leak.object_type}")
                    try:
                        from .metrics_collector import metrics_collector
                        metrics_collector.increment_counter('memory_leaks_detected')
                    except ImportError:
                        pass
        except Exception as e:
            logger.error(f"Error checking for leaks: {e}")

    def _is_memory_growing(self, memory_values: List[float], threshold: float = 5.0) -> bool:
        """Return True if memory grew by more than `threshold` MB end-to-end."""
        if len(memory_values) < 5:
            return False
        growth = memory_values[-1] - memory_values[0]
        return growth > threshold

    def _analyze_object_growth(self) -> List["MemoryLeak"]:
        """Identify object types whose counts grew enough to look like leaks."""
        leaks = []
        for obj_type, counts in self.object_counts.items():
            if len(counts) < 10:
                continue
            recent = list(counts)[-10:]
            timestamps = [ts for ts, _ in recent]
            count_values = [c for _, c in recent]

            growth = count_values[-1] - count_values[0]
            time_diff = (timestamps[-1] - timestamps[0]).total_seconds() / 3600  # hours
            # More than 100 new objects over a positive time window
            if growth > 100 and time_diff > 0:
                growth_rate = growth / time_diff
                if growth_rate > 1000:
                    severity = 'high'
                elif growth_rate > 100:
                    severity = 'medium'
                else:
                    severity = 'low'
                leaks.append(MemoryLeak(
                    object_type=obj_type,
                    count_increase=growth,
                    size_increase_mb=growth * 0.001,  # Rough estimate
                    growth_rate_per_hour=growth_rate,
                    severity=severity
                ))
        return leaks

    def _optimize_gc(self) -> None:
        """Adjust GC thresholds (and force a collection) based on memory pressure."""
        try:
            if not self.memory_snapshots:
                return
            latest_snapshot = self.memory_snapshots[-1]
            memory_usage_percent = (
                (latest_snapshot.total_memory_mb - latest_snapshot.available_memory_mb) /
                latest_snapshot.total_memory_mb * 100
            )

            if memory_usage_percent > 85:
                # High memory pressure - more aggressive GC
                new_thresholds = (500, 10, 10)
                if gc.get_threshold() != new_thresholds:
                    gc.set_threshold(*new_thresholds)
                    logger.info("Enabled aggressive garbage collection due to high memory usage")
                    # Force collection on the transition into high pressure
                    collected = gc.collect()
                    # FIX: metrics_collector was referenced here without the
                    # lazy import used everywhere else, raising NameError
                    # whenever memory pressure crossed 85%.
                    try:
                        from .metrics_collector import metrics_collector
                        metrics_collector.increment_counter('memory_gc_forced')
                    except ImportError:
                        pass
                    logger.debug(f"Forced GC collected {collected} objects")
            elif memory_usage_percent < 50:
                # Low memory pressure - less aggressive GC
                new_thresholds = (1000, 20, 20)
                if gc.get_threshold() != new_thresholds:
                    gc.set_threshold(*new_thresholds)
                    logger.info("Reduced garbage collection frequency due to low memory usage")

            self.gc_stats = {
                'threshold': gc.get_threshold(),
                'counts': gc.get_count(),
                'collections': gc.get_stats()
            }
        except Exception as e:
            logger.error(f"Error optimizing GC: {e}")

    def force_garbage_collection(self) -> Dict[str, int]:
        """Force a full collection of all generations and return statistics."""
        try:
            before_counts = gc.get_count()
            before_objects = len(gc.get_objects())

            # Collect each generation individually so we can report per-gen counts
            collected = [gc.collect(generation) for generation in range(3)]
            total_collected = sum(collected)

            after_counts = gc.get_count()
            after_objects = len(gc.get_objects())

            try:
                from .metrics_collector import metrics_collector
                metrics_collector.increment_counter('memory_gc_manual')
                metrics_collector.set_gauge('memory_gc_objects_collected', total_collected)
            except ImportError:
                pass

            result = {
                'total_collected': total_collected,
                'by_generation': collected,
                'objects_before': before_objects,
                'objects_after': after_objects,
                'objects_freed': before_objects - after_objects,
                'counts_before': before_counts,
                'counts_after': after_counts
            }
            logger.info(f"Manual GC collected {total_collected} objects, freed {result['objects_freed']} objects")
            return result
        except Exception as e:
            logger.error(f"Error during forced garbage collection: {e}")
            return {}

    def get_memory_usage_summary(self) -> Dict[str, Any]:
        """Return the latest snapshot as a flat dict (empty if none taken yet)."""
        if not self.memory_snapshots:
            return {}
        latest = self.memory_snapshots[-1]
        memory_usage_percent = (
            (latest.total_memory_mb - latest.available_memory_mb) /
            latest.total_memory_mb * 100
        )
        return {
            'timestamp': latest.timestamp.isoformat(),
            'total_memory_mb': latest.total_memory_mb,
            'available_memory_mb': latest.available_memory_mb,
            'used_memory_mb': latest.total_memory_mb - latest.available_memory_mb,
            'memory_usage_percent': memory_usage_percent,
            'process_memory_mb': latest.process_memory_mb,
            'gc_objects': latest.gc_objects,
            'gc_collections': latest.gc_collections,
            'tracemalloc_current_mb': latest.tracemalloc_current_mb,
            'tracemalloc_peak_mb': latest.tracemalloc_peak_mb
        }

    def get_memory_trends(self, hours: int = 1) -> Dict[str, Any]:
        """Summarize process-memory and GC-object trends over the last `hours`."""
        if not self.memory_snapshots:
            return {}
        from datetime import timedelta
        cutoff_time = get_current_timestamp() - timedelta(hours=hours)

        recent_snapshots = [
            s for s in self.memory_snapshots
            if s.timestamp >= cutoff_time
        ]
        if len(recent_snapshots) < 2:
            return {'trend': 'insufficient_data'}

        process_memory_values = [s.process_memory_mb for s in recent_snapshots]
        gc_object_values = [s.gc_objects for s in recent_snapshots]
        return {
            'process_memory': {
                'start_mb': process_memory_values[0],
                'end_mb': process_memory_values[-1],
                'change_mb': process_memory_values[-1] - process_memory_values[0],
                'max_mb': max(process_memory_values),
                'min_mb': min(process_memory_values),
                'avg_mb': sum(process_memory_values) / len(process_memory_values)
            },
            'gc_objects': {
                'start_count': gc_object_values[0],
                'end_count': gc_object_values[-1],
                'change_count': gc_object_values[-1] - gc_object_values[0],
                'max_count': max(gc_object_values),
                'min_count': min(gc_object_values),
                'avg_count': sum(gc_object_values) / len(gc_object_values)
            },
            'sample_count': len(recent_snapshots),
            'time_period_hours': hours
        }

    def get_top_memory_consumers(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Return the `limit` object types with the highest latest counts."""
        if not self.object_counts:
            return []
        # Latest count per type, from the (timestamp, count) tuples
        latest_counts = {
            obj_type: counts[-1][1]
            for obj_type, counts in self.object_counts.items()
            if counts
        }
        sorted_types = sorted(latest_counts.items(), key=lambda item: item[1], reverse=True)
        return [
            {
                'object_type': obj_type,
                'count': count,
                'estimated_size_mb': count * 0.001  # Rough estimate
            }
            for obj_type, count in sorted_types[:limit]
        ]

    def get_detected_leaks(self) -> List[Dict[str, Any]]:
        """Return detected memory leaks as plain dicts."""
        return [
            {
                'object_type': leak.object_type,
                'count_increase': leak.count_increase,
                'size_increase_mb': leak.size_increase_mb,
                'growth_rate_per_hour': leak.growth_rate_per_hour,
                'severity': leak.severity
            }
            for leak in self.detected_leaks
        ]

    def get_tracemalloc_top(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Return the top `limit` allocation sites from tracemalloc, or []."""
        if not self.enable_tracemalloc or not tracemalloc.is_tracing():
            return []
        try:
            snapshot = tracemalloc.take_snapshot()
            top_stats = snapshot.statistics('lineno')
            return [
                {
                    'filename': stat.traceback.format()[0],
                    'size_mb': stat.size / (1024 * 1024),
                    'count': stat.count
                }
                for stat in top_stats[:limit]
            ]
        except Exception as e:
            logger.error(f"Error getting tracemalloc top: {e}")
            return []

    def clear_leak_history(self) -> None:
        """Clear detected leak history"""
        self.detected_leaks.clear()
        logger.info("Cleared memory leak history")

    def get_gc_stats(self) -> Dict[str, Any]:
        """Get garbage collection statistics"""
        return {
            'thresholds': gc.get_threshold(),
            'counts': gc.get_count(),
            'stats': gc.get_stats(),
            'auto_gc_enabled': self.auto_gc_enabled,
            'is_enabled': gc.isenabled()
        }

    def set_gc_thresholds(self, gen0: int, gen1: int, gen2: int) -> None:
        """Set garbage collection thresholds"""
        gc.set_threshold(gen0, gen1, gen2)
        logger.info(f"Set GC thresholds to ({gen0}, {gen1}, {gen2})")

    def enable_auto_gc_optimization(self, enabled: bool = True) -> None:
        """Enable or disable automatic GC optimization"""
        self.auto_gc_enabled = enabled
        logger.info(f"Auto GC optimization {'enabled' if enabled else 'disabled'}")

    def enable_leak_detection(self, enabled: bool = True) -> None:
        """Enable or disable memory leak detection"""
        self.leak_detection_enabled = enabled
        logger.info(f"Memory leak detection {'enabled' if enabled else 'disabled'}")

    def get_stats(self) -> Dict[str, Any]:
        """Get memory monitor statistics"""
        return {
            'monitoring': self._monitoring,
            'snapshot_interval': self.snapshot_interval,
            'snapshots_count': len(self.memory_snapshots),
            'object_types_tracked': len(self.object_counts),
            'detected_leaks': len(self.detected_leaks),
            'tracemalloc_enabled': self.enable_tracemalloc and tracemalloc.is_tracing(),
            'auto_gc_enabled': self.auto_gc_enabled,
            'leak_detection_enabled': self.leak_detection_enabled,
            'gc_thresholds': gc.get_threshold()
        }
# Global memory monitor instance
# NOTE: constructed at import time; if enable_tracemalloc is left at its
# default this may start tracemalloc as an import side effect.
memory_monitor = MemoryMonitor()