"""
|
|
Memory usage monitoring and garbage collection optimization.
|
|
"""
|
|
|
|
import gc
|
|
import sys
|
|
import threading
|
|
import tracemalloc
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from collections import defaultdict, deque
|
|
from datetime import datetime, timezone
|
|
from dataclasses import dataclass
|
|
|
|
try:
|
|
from ..utils.logging import get_logger
|
|
from ..utils.timing import get_current_timestamp
|
|
except ImportError:
|
|
from utils.logging import get_logger
|
|
from utils.timing import get_current_timestamp
|
|
# Import will be done lazily to avoid circular imports
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
@dataclass
class MemorySnapshot:
    """Memory usage snapshot"""
    timestamp: datetime
    total_memory_mb: float
    available_memory_mb: float
    process_memory_mb: float
    gc_collections: Dict[int, int]
    gc_objects: int
    tracemalloc_current_mb: Optional[float] = None
    tracemalloc_peak_mb: Optional[float] = None


@dataclass
class MemoryLeak:
    """Memory leak detection result"""
    object_type: str
    count_increase: int
    size_increase_mb: float
    growth_rate_per_hour: float
    severity: str  # 'low', 'medium', 'high'


class MemoryMonitor:
    """
    Monitors memory usage, detects leaks, and optimizes garbage collection.

    Provides detailed memory analytics and automatic GC optimization.
    """

    def __init__(self, enable_tracemalloc: bool = True, snapshot_interval: float = 30.0):
        """
        Initialize memory monitor.

        Args:
            enable_tracemalloc: Whether to enable detailed memory tracing
            snapshot_interval: How often to take memory snapshots (seconds)
        """
        self.enable_tracemalloc = enable_tracemalloc
        self.snapshot_interval = snapshot_interval

        # Memory tracking
        self.memory_snapshots: deque = deque(maxlen=1000)
        self.object_counts: Dict[str, deque] = defaultdict(lambda: deque(maxlen=100))

        # GC optimization
        self.gc_stats: Dict[str, Any] = {}
        self.gc_thresholds = gc.get_threshold()
        self.auto_gc_enabled = True

        # Leak detection
        self.leak_detection_enabled = True
        self.detected_leaks: List[MemoryLeak] = []

        # Monitoring control
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None

        # Initialize tracemalloc if enabled
        if self.enable_tracemalloc and not tracemalloc.is_tracing():
            tracemalloc.start()
            logger.info("Started tracemalloc for detailed memory tracking")

        logger.info(f"Memory monitor initialized (tracemalloc: {self.enable_tracemalloc})")

    def start_monitoring(self) -> None:
        """Start memory monitoring"""
        if self._monitoring:
            logger.warning("Memory monitoring already running")
            return

        self._monitoring = True
        self._monitor_thread = threading.Thread(
            target=self._monitoring_loop,
            name="MemoryMonitor",
            daemon=True
        )
        self._monitor_thread.start()
        logger.info("Started memory monitoring")

    def stop_monitoring(self) -> None:
        """Stop memory monitoring"""
        if not self._monitoring:
            return

        self._monitoring = False
        if self._monitor_thread:
            self._monitor_thread.join(timeout=5.0)
        logger.info("Stopped memory monitoring")

    def _monitoring_loop(self) -> None:
        """Main monitoring loop"""
        import time

        while self._monitoring:
            try:
                # Take memory snapshot
                self._take_memory_snapshot()

                # Update object counts
                self._update_object_counts()

                # Check for memory leaks
                if self.leak_detection_enabled:
                    self._check_for_leaks()

                # Optimize garbage collection
                if self.auto_gc_enabled:
                    self._optimize_gc()

                time.sleep(self.snapshot_interval)

            except Exception as e:
                logger.error(f"Error in memory monitoring loop: {e}")
                time.sleep(self.snapshot_interval)

    def _take_memory_snapshot(self) -> None:
        """Take a memory usage snapshot"""
        try:
            import psutil

            # Get system memory info
            memory = psutil.virtual_memory()

            # Get process memory info
            process = psutil.Process()
            process_memory = process.memory_info()

            # Get GC stats
            gc_collections = {i: gc.get_count()[i] for i in range(3)}
            gc_objects = len(gc.get_objects())

            # Get tracemalloc stats if enabled
            tracemalloc_current_mb = None
            tracemalloc_peak_mb = None

            if self.enable_tracemalloc and tracemalloc.is_tracing():
                current, peak = tracemalloc.get_traced_memory()
                tracemalloc_current_mb = current / (1024 * 1024)
                tracemalloc_peak_mb = peak / (1024 * 1024)

            # Create snapshot
            snapshot = MemorySnapshot(
                timestamp=get_current_timestamp(),
                total_memory_mb=memory.total / (1024 * 1024),
                available_memory_mb=memory.available / (1024 * 1024),
                process_memory_mb=process_memory.rss / (1024 * 1024),
                gc_collections=gc_collections,
                gc_objects=gc_objects,
                tracemalloc_current_mb=tracemalloc_current_mb,
                tracemalloc_peak_mb=tracemalloc_peak_mb
            )

            self.memory_snapshots.append(snapshot)

            # Update metrics
            try:
                from .metrics_collector import metrics_collector
                metrics_collector.set_gauge('memory_total_mb', snapshot.total_memory_mb)
                metrics_collector.set_gauge('memory_available_mb', snapshot.available_memory_mb)
                metrics_collector.set_gauge('memory_process_mb', snapshot.process_memory_mb)
                metrics_collector.set_gauge('memory_gc_objects', snapshot.gc_objects)

                if tracemalloc_current_mb is not None:
                    metrics_collector.set_gauge('memory_tracemalloc_current_mb', tracemalloc_current_mb)
                    metrics_collector.set_gauge('memory_tracemalloc_peak_mb', tracemalloc_peak_mb)
            except ImportError:
                pass  # Metrics collector not available

        except Exception as e:
            logger.error(f"Error taking memory snapshot: {e}")

    def _update_object_counts(self) -> None:
        """Update object counts by type"""
        try:
            # Count objects by type
            object_counts = defaultdict(int)

            for obj in gc.get_objects():
                obj_type = type(obj).__name__
                object_counts[obj_type] += 1

            # Store counts with timestamp
            timestamp = get_current_timestamp()
            for obj_type, count in object_counts.items():
                self.object_counts[obj_type].append((timestamp, count))

            # Update metrics for common types
            try:
                from .metrics_collector import metrics_collector
                common_types = ['dict', 'list', 'tuple', 'str', 'function', 'type']
                for obj_type in common_types:
                    if obj_type in object_counts:
                        metrics_collector.set_gauge(
                            f'memory_objects_{obj_type}',
                            object_counts[obj_type]
                        )
            except ImportError:
                pass  # Metrics collector not available

        except Exception as e:
            logger.error(f"Error updating object counts: {e}")

    def _check_for_leaks(self) -> None:
        """Check for potential memory leaks"""
        try:
            if len(self.memory_snapshots) < 10:
                return  # Need more data

            # Check for consistent memory growth
            recent_snapshots = list(self.memory_snapshots)[-10:]
            memory_values = [s.process_memory_mb for s in recent_snapshots]

            # Simple start-to-end growth check to detect an upward trend
            if self._is_memory_growing(memory_values):
                # Check object count growth
                potential_leaks = self._analyze_object_growth()

                for leak in potential_leaks:
                    if leak not in self.detected_leaks:
                        self.detected_leaks.append(leak)
                        logger.warning(f"Potential memory leak detected: {leak.object_type}")

                        # Record leak detection
                        try:
                            from .metrics_collector import metrics_collector
                            metrics_collector.increment_counter('memory_leaks_detected')
                        except ImportError:
                            pass

        except Exception as e:
            logger.error(f"Error checking for leaks: {e}")

    def _is_memory_growing(self, memory_values: List[float], threshold: float = 5.0) -> bool:
        """Check if memory is consistently growing"""
        if len(memory_values) < 5:
            return False

        # Check if memory increased by more than threshold MB
        growth = memory_values[-1] - memory_values[0]
        return growth > threshold

    def _analyze_object_growth(self) -> List[MemoryLeak]:
        """Analyze object count growth to identify potential leaks"""
        leaks = []

        for obj_type, counts in self.object_counts.items():
            if len(counts) < 10:
                continue

            # Get recent counts
            recent_counts = list(counts)[-10:]
            timestamps = [item[0] for item in recent_counts]
            count_values = [item[1] for item in recent_counts]

            # Check for growth
            if len(count_values) >= 2:
                growth = count_values[-1] - count_values[0]
                time_diff = (timestamps[-1] - timestamps[0]).total_seconds() / 3600  # hours

                if growth > 100 and time_diff > 0:  # More than 100 objects growth
                    growth_rate = growth / time_diff

                    # Determine severity
                    if growth_rate > 1000:
                        severity = 'high'
                    elif growth_rate > 100:
                        severity = 'medium'
                    else:
                        severity = 'low'

                    leak = MemoryLeak(
                        object_type=obj_type,
                        count_increase=growth,
                        size_increase_mb=growth * 0.001,  # Rough estimate
                        growth_rate_per_hour=growth_rate,
                        severity=severity
                    )
                    leaks.append(leak)

        return leaks

    def _optimize_gc(self) -> None:
        """Optimize garbage collection based on memory usage"""
        try:
            if not self.memory_snapshots:
                return

            latest_snapshot = self.memory_snapshots[-1]
            memory_usage_percent = (
                (latest_snapshot.total_memory_mb - latest_snapshot.available_memory_mb) /
                latest_snapshot.total_memory_mb * 100
            )

            # Adjust GC thresholds based on memory pressure
            if memory_usage_percent > 85:
                # High memory pressure - more aggressive GC
                new_thresholds = (500, 10, 10)
                if gc.get_threshold() != new_thresholds:
                    gc.set_threshold(*new_thresholds)
                    logger.info("Enabled aggressive garbage collection due to high memory usage")

                # Force collection
                collected = gc.collect()
                try:
                    # Lazy import, consistent with the other metrics updates in this module
                    from .metrics_collector import metrics_collector
                    metrics_collector.increment_counter('memory_gc_forced')
                except ImportError:
                    pass  # Metrics collector not available
                logger.debug(f"Forced GC collected {collected} objects")

            elif memory_usage_percent < 50:
                # Low memory pressure - less aggressive GC
                new_thresholds = (1000, 20, 20)
                if gc.get_threshold() != new_thresholds:
                    gc.set_threshold(*new_thresholds)
                    logger.info("Reduced garbage collection frequency due to low memory usage")

            # Update GC stats
            self.gc_stats = {
                'threshold': gc.get_threshold(),
                'counts': gc.get_count(),
                'collections': gc.get_stats()
            }

        except Exception as e:
            logger.error(f"Error optimizing GC: {e}")

    def force_garbage_collection(self) -> Dict[str, Any]:
        """Force garbage collection and return statistics"""
        try:
            # Get counts before collection
            before_counts = gc.get_count()
            before_objects = len(gc.get_objects())

            # Force collection for all generations
            collected = [gc.collect(generation) for generation in range(3)]
            total_collected = sum(collected)

            # Get counts after collection
            after_counts = gc.get_count()
            after_objects = len(gc.get_objects())

            # Update metrics
            try:
                from .metrics_collector import metrics_collector
                metrics_collector.increment_counter('memory_gc_manual')
                metrics_collector.set_gauge('memory_gc_objects_collected', total_collected)
            except ImportError:
                pass

            result = {
                'total_collected': total_collected,
                'by_generation': collected,
                'objects_before': before_objects,
                'objects_after': after_objects,
                'objects_freed': before_objects - after_objects,
                'counts_before': before_counts,
                'counts_after': after_counts
            }

            logger.info(f"Manual GC collected {total_collected} objects, freed {result['objects_freed']} objects")
            return result

        except Exception as e:
            logger.error(f"Error during forced garbage collection: {e}")
            return {}

    def get_memory_usage_summary(self) -> Dict[str, Any]:
        """Get current memory usage summary"""
        if not self.memory_snapshots:
            return {}

        latest = self.memory_snapshots[-1]

        # Calculate memory usage percentage
        memory_usage_percent = (
            (latest.total_memory_mb - latest.available_memory_mb) /
            latest.total_memory_mb * 100
        )

        return {
            'timestamp': latest.timestamp.isoformat(),
            'total_memory_mb': latest.total_memory_mb,
            'available_memory_mb': latest.available_memory_mb,
            'used_memory_mb': latest.total_memory_mb - latest.available_memory_mb,
            'memory_usage_percent': memory_usage_percent,
            'process_memory_mb': latest.process_memory_mb,
            'gc_objects': latest.gc_objects,
            'gc_collections': latest.gc_collections,
            'tracemalloc_current_mb': latest.tracemalloc_current_mb,
            'tracemalloc_peak_mb': latest.tracemalloc_peak_mb
        }

    def get_memory_trends(self, hours: int = 1) -> Dict[str, Any]:
        """Get memory usage trends over specified time period"""
        if not self.memory_snapshots:
            return {}

        from datetime import timedelta
        cutoff_time = get_current_timestamp() - timedelta(hours=hours)

        # Filter snapshots
        recent_snapshots = [
            s for s in self.memory_snapshots
            if s.timestamp >= cutoff_time
        ]

        if len(recent_snapshots) < 2:
            return {'trend': 'insufficient_data'}

        # Calculate trends
        process_memory_values = [s.process_memory_mb for s in recent_snapshots]
        gc_object_values = [s.gc_objects for s in recent_snapshots]

        return {
            'process_memory': {
                'start_mb': process_memory_values[0],
                'end_mb': process_memory_values[-1],
                'change_mb': process_memory_values[-1] - process_memory_values[0],
                'max_mb': max(process_memory_values),
                'min_mb': min(process_memory_values),
                'avg_mb': sum(process_memory_values) / len(process_memory_values)
            },
            'gc_objects': {
                'start_count': gc_object_values[0],
                'end_count': gc_object_values[-1],
                'change_count': gc_object_values[-1] - gc_object_values[0],
                'max_count': max(gc_object_values),
                'min_count': min(gc_object_values),
                'avg_count': sum(gc_object_values) / len(gc_object_values)
            },
            'sample_count': len(recent_snapshots),
            'time_period_hours': hours
        }

    def get_top_memory_consumers(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Get top memory consuming object types"""
        if not self.object_counts:
            return []

        # Get latest counts
        latest_counts = {}
        for obj_type, counts in self.object_counts.items():
            if counts:
                latest_counts[obj_type] = counts[-1][1]  # Get count from (timestamp, count) tuple

        # Sort by count
        sorted_types = sorted(
            latest_counts.items(),
            key=lambda x: x[1],
            reverse=True
        )

        return [
            {
                'object_type': obj_type,
                'count': count,
                'estimated_size_mb': count * 0.001  # Rough estimate
            }
            for obj_type, count in sorted_types[:limit]
        ]

    def get_detected_leaks(self) -> List[Dict[str, Any]]:
        """Get detected memory leaks"""
        return [
            {
                'object_type': leak.object_type,
                'count_increase': leak.count_increase,
                'size_increase_mb': leak.size_increase_mb,
                'growth_rate_per_hour': leak.growth_rate_per_hour,
                'severity': leak.severity
            }
            for leak in self.detected_leaks
        ]

    def get_tracemalloc_top(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Get top memory allocations from tracemalloc"""
        if not self.enable_tracemalloc or not tracemalloc.is_tracing():
            return []

        try:
            snapshot = tracemalloc.take_snapshot()
            top_stats = snapshot.statistics('lineno')

            return [
                {
                    'filename': stat.traceback.format()[0],
                    'size_mb': stat.size / (1024 * 1024),
                    'count': stat.count
                }
                for stat in top_stats[:limit]
            ]

        except Exception as e:
            logger.error(f"Error getting tracemalloc top: {e}")
            return []

    def clear_leak_history(self) -> None:
        """Clear detected leak history"""
        self.detected_leaks.clear()
        logger.info("Cleared memory leak history")

    def get_gc_stats(self) -> Dict[str, Any]:
        """Get garbage collection statistics"""
        return {
            'thresholds': gc.get_threshold(),
            'counts': gc.get_count(),
            'stats': gc.get_stats(),
            'auto_gc_enabled': self.auto_gc_enabled,
            'is_enabled': gc.isenabled()
        }

    def set_gc_thresholds(self, gen0: int, gen1: int, gen2: int) -> None:
        """Set garbage collection thresholds"""
        gc.set_threshold(gen0, gen1, gen2)
        logger.info(f"Set GC thresholds to ({gen0}, {gen1}, {gen2})")

    def enable_auto_gc_optimization(self, enabled: bool = True) -> None:
        """Enable or disable automatic GC optimization"""
        self.auto_gc_enabled = enabled
        logger.info(f"Auto GC optimization {'enabled' if enabled else 'disabled'}")

    def enable_leak_detection(self, enabled: bool = True) -> None:
        """Enable or disable memory leak detection"""
        self.leak_detection_enabled = enabled
        logger.info(f"Memory leak detection {'enabled' if enabled else 'disabled'}")

    def get_stats(self) -> Dict[str, Any]:
        """Get memory monitor statistics"""
        return {
            'monitoring': self._monitoring,
            'snapshot_interval': self.snapshot_interval,
            'snapshots_count': len(self.memory_snapshots),
            'object_types_tracked': len(self.object_counts),
            'detected_leaks': len(self.detected_leaks),
            'tracemalloc_enabled': self.enable_tracemalloc and tracemalloc.is_tracing(),
            'auto_gc_enabled': self.auto_gc_enabled,
            'leak_detection_enabled': self.leak_detection_enabled,
            'gc_thresholds': gc.get_threshold()
        }


# Global memory monitor instance
memory_monitor = MemoryMonitor()
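

# Illustrative usage sketch (an addition, not part of the original API surface):
# shows one way the module-level `memory_monitor` could be exercised when this
# file is run directly. It assumes psutil is installed; without it, snapshots
# log an error and the summary dict stays empty.
if __name__ == "__main__":
    import time

    memory_monitor.start_monitoring()
    try:
        # The monitoring loop takes its first snapshot immediately, so a short
        # wait is enough for get_memory_usage_summary() to return data.
        time.sleep(2)
        print(memory_monitor.get_memory_usage_summary())
        print(memory_monitor.get_top_memory_consumers(limit=5))
        print(memory_monitor.force_garbage_collection())
    finally:
        memory_monitor.stop_monitoring()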