# gogo2/model_manager.py
"""
Enhanced Model Management System for Trading Dashboard
This system provides:
- Automatic cleanup of old model checkpoints
- Best model tracking with performance metrics
- Configurable retention policies
- Startup model loading
- Performance-based model selection
"""
import json
import logging
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional

import torch

logger = logging.getLogger(__name__)

@dataclass
class ModelMetrics:
    """Performance metrics for model evaluation"""
    accuracy: float = 0.0
    profit_factor: float = 0.0
    win_rate: float = 0.0
    sharpe_ratio: float = 0.0
    max_drawdown: float = 0.0
    total_trades: int = 0
    avg_trade_duration: float = 0.0
    confidence_score: float = 0.0

    def get_composite_score(self) -> float:
        """Calculate composite performance score"""
        # Weighted composite score
        weights = {
            'profit_factor': 0.3,
            'sharpe_ratio': 0.25,
            'win_rate': 0.2,
            'accuracy': 0.15,
            'confidence_score': 0.1
        }
        # Normalize values to the 0-1 range
        normalized_pf = min(max(self.profit_factor / 3.0, 0), 1)  # PF of 3+ maps to 1.0
        normalized_sharpe = min(max((self.sharpe_ratio + 2) / 4, 0), 1)  # Sharpe -2..2 -> 0..1
        normalized_win_rate = self.win_rate
        normalized_accuracy = self.accuracy
        normalized_confidence = self.confidence_score
        # Drawdown penalty: scales linearly from 1.0 at 0% drawdown to 0.0 at >=20%
        drawdown_penalty = max(0, 1 - self.max_drawdown / 0.2)
        score = (
            weights['profit_factor'] * normalized_pf +
            weights['sharpe_ratio'] * normalized_sharpe +
            weights['win_rate'] * normalized_win_rate +
            weights['accuracy'] * normalized_accuracy +
            weights['confidence_score'] * normalized_confidence
        ) * drawdown_penalty
        return min(max(score, 0), 1)
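
# A minimal worked example of the composite score (hypothetical numbers,
# not from the source):
#
#     m = ModelMetrics(accuracy=0.62, profit_factor=2.4, win_rate=0.6,
#                      sharpe_ratio=1.0, max_drawdown=0.1, confidence_score=0.7)
#     m.get_composite_score()
#     # weighted sum: 0.3*0.8 + 0.25*0.75 + 0.2*0.6 + 0.15*0.62 + 0.1*0.7 = 0.7105
#     # drawdown penalty: 1 - 0.1/0.2 = 0.5  ->  final score 0.7105 * 0.5 ~= 0.355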

@dataclass
class ModelInfo:
    """Complete model information and metadata"""
    model_type: str  # 'cnn', 'rl', 'transformer'
    model_name: str
    file_path: str
    creation_time: datetime
    last_updated: datetime
    file_size_mb: float
    metrics: ModelMetrics
    training_episodes: int = 0
    model_version: str = "1.0"

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization"""
        data = asdict(self)
        data['creation_time'] = self.creation_time.isoformat()
        data['last_updated'] = self.last_updated.isoformat()
        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ModelInfo':
        """Create from a dictionary (inverse of to_dict)"""
        data = dict(data)  # copy so the caller's dict is not mutated
        data['creation_time'] = datetime.fromisoformat(data['creation_time'])
        data['last_updated'] = datetime.fromisoformat(data['last_updated'])
        data['metrics'] = ModelMetrics(**data['metrics'])
        return cls(**data)
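
# Serialization round-trip sketch (hypothetical values), showing that
# to_dict/from_dict survive a pass through JSON:
#
#     info = ModelInfo(model_type='cnn', model_name='cnn_2action_20250101_000000',
#                      file_path='/tmp/model.pt', creation_time=datetime.now(),
#                      last_updated=datetime.now(), file_size_mb=12.5,
#                      metrics=ModelMetrics())
#     restored = ModelInfo.from_dict(json.loads(json.dumps(info.to_dict())))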

class ModelManager:
    """Enhanced model management system"""

    def __init__(self, base_dir: str = ".", config: Optional[Dict[str, Any]] = None):
        self.base_dir = Path(base_dir)
        self.config = config or self._get_default_config()
        # Model directories
        self.models_dir = self.base_dir / "models"
        self.nn_models_dir = self.base_dir / "NN" / "models"
        self.registry_file = self.models_dir / "model_registry.json"
        self.best_models_dir = self.models_dir / "best_models"
        # Create directories
        self.best_models_dir.mkdir(parents=True, exist_ok=True)
        # Model registry: model name -> ModelInfo
        self.model_registry: Dict[str, ModelInfo] = {}
        self._load_registry()
        logger.info(f"Model Manager initialized - Base: {self.base_dir}")
        logger.info(f"Retention policy: Keep {self.config['max_models_per_type']} best models per type")

    def _get_default_config(self) -> Dict[str, Any]:
        """Get default configuration"""
        return {
            'max_models_per_type': 3,          # Keep top 3 models per type
            'max_total_models': 10,            # Maximum total models to keep
            'cleanup_frequency_hours': 24,     # Cleanup every 24 hours
            'min_performance_threshold': 0.3,  # Minimum composite score
            'max_checkpoint_age_days': 7,      # Delete checkpoints older than 7 days
            'auto_cleanup_enabled': True,
            'backup_before_cleanup': True,
            'model_size_limit_mb': 100,        # Individual model size limit
            'total_storage_limit_gb': 5.0      # Total storage limit
        }
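
    # Note: a config dict passed to __init__ replaces the defaults wholesale
    # (it is not merged), so include every key when overriding. Sketch with
    # illustrative values:
    #
    #     cfg = ModelManager()._get_default_config()
    #     cfg['max_models_per_type'] = 5
    #     manager = ModelManager(config=cfg)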

    def _load_registry(self):
        """Load model registry from file"""
        try:
            if self.registry_file.exists():
                with open(self.registry_file, 'r') as f:
                    data = json.load(f)
                self.model_registry = {
                    k: ModelInfo.from_dict(v) for k, v in data.items()
                }
                logger.info(f"Loaded {len(self.model_registry)} models from registry")
            else:
                logger.info("No existing model registry found")
        except Exception as e:
            logger.error(f"Error loading model registry: {e}")
            self.model_registry = {}

    def _save_registry(self):
        """Save model registry to file"""
        try:
            self.models_dir.mkdir(parents=True, exist_ok=True)
            with open(self.registry_file, 'w') as f:
                data = {k: v.to_dict() for k, v in self.model_registry.items()}
                json.dump(data, f, indent=2, default=str)
            logger.info(f"Saved registry with {len(self.model_registry)} models")
        except Exception as e:
            logger.error(f"Error saving model registry: {e}")

    def cleanup_all_existing_models(self, confirm: bool = False) -> Dict[str, Any]:
        """
        Clean up all existing model files and prepare for 2-action system training

        Args:
            confirm: If True, perform the cleanup. If False, only report what would be cleaned

        Returns:
            Dict with cleanup statistics
        """
        cleanup_stats = {
            'files_found': 0,
            'files_deleted': 0,
            'directories_cleaned': 0,
            'space_freed_mb': 0.0,
            'errors': []
        }
        # Model file patterns for both 2-action and legacy 3-action systems
        model_patterns = [
            "**/*.pt", "**/*.pth", "**/*.h5", "**/*.pkl", "**/*.joblib", "**/*.model",
            "**/checkpoint_*", "**/model_*", "**/cnn_*", "**/dqn_*", "**/rl_*"
        ]
        # Directories to clean
        model_directories = [
            "models/saved",
            "NN/models/saved",
            "NN/models/saved/checkpoints",
            "NN/models/saved/realtime_checkpoints",
            "NN/models/saved/realtime_ticks_checkpoints",
            "model_backups"
        ]
        try:
            # Scan for files to be cleaned
            for directory in model_directories:
                dir_path = self.base_dir / directory
                if dir_path.exists():
                    for pattern in model_patterns:
                        for file_path in dir_path.glob(pattern):
                            if file_path.is_file():
                                cleanup_stats['files_found'] += 1
                                file_size = file_path.stat().st_size / (1024 * 1024)  # MB
                                cleanup_stats['space_freed_mb'] += file_size
                                if confirm:
                                    try:
                                        file_path.unlink()
                                        cleanup_stats['files_deleted'] += 1
                                        logger.info(f"Deleted model file: {file_path}")
                                    except Exception as e:
                                        cleanup_stats['errors'].append(f"Failed to delete {file_path}: {e}")
            # Clean up empty checkpoint directories
            for directory in model_directories:
                dir_path = self.base_dir / directory
                if dir_path.exists():
                    # Walk deepest-first so parents emptied by this pass are removed too
                    for subdir in sorted(dir_path.rglob("*"), key=lambda p: len(p.parts), reverse=True):
                        if subdir.is_dir() and not any(subdir.iterdir()):
                            if confirm:
                                try:
                                    subdir.rmdir()
                                    cleanup_stats['directories_cleaned'] += 1
                                    logger.info(f"Removed empty directory: {subdir}")
                                except Exception as e:
                                    cleanup_stats['errors'].append(f"Failed to remove directory {subdir}: {e}")
            if confirm:
                # Reset the registry for a fresh start with the 2-action system.
                # The registry stays a flat name -> ModelInfo mapping so that
                # _load_registry and register_model keep working against it.
                self.model_registry = {}
                self._save_registry()
                logger.info("=" * 60)
                logger.info("MODEL CLEANUP COMPLETED - 2-ACTION SYSTEM READY")
                logger.info(f"Files deleted: {cleanup_stats['files_deleted']}")
                logger.info(f"Space freed: {cleanup_stats['space_freed_mb']:.2f} MB")
                logger.info(f"Directories cleaned: {cleanup_stats['directories_cleaned']}")
                logger.info("Registry reset for 2-action system (BUY/SELL)")
                logger.info("Ready for fresh training with intelligent position management")
                logger.info("=" * 60)
            else:
                logger.info("=" * 60)
                logger.info("MODEL CLEANUP PREVIEW - 2-ACTION SYSTEM MIGRATION")
                logger.info(f"Files to delete: {cleanup_stats['files_found']}")
                logger.info(f"Space to free: {cleanup_stats['space_freed_mb']:.2f} MB")
                logger.info("Run with confirm=True to perform cleanup")
                logger.info("=" * 60)
        except Exception as e:
            cleanup_stats['errors'].append(f"Cleanup error: {e}")
            logger.error(f"Error during model cleanup: {e}")
        return cleanup_stats

    def register_model(self, model_path: str, model_type: str, metrics: Optional[ModelMetrics] = None) -> str:
        """
        Register a new model in the 2-action system

        Args:
            model_path: Path to the model file
            model_type: Type of model ('cnn', 'rl', 'transformer')
            metrics: Performance metrics

        Returns:
            str: Unique model name/ID
        """
        if not Path(model_path).exists():
            raise FileNotFoundError(f"Model file not found: {model_path}")
        # Generate unique model name
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_name = f"{model_type}_2action_{timestamp}"
        # Get file info
        file_path = Path(model_path)
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        # Default metrics for 2-action system
        if metrics is None:
            metrics = ModelMetrics(
                accuracy=0.0,
                profit_factor=1.0,
                win_rate=0.5,
                sharpe_ratio=0.0,
                max_drawdown=0.0,
                confidence_score=0.5
            )
        # Create model info
        model_info = ModelInfo(
            model_type=model_type,
            model_name=model_name,
            file_path=str(file_path.absolute()),
            creation_time=datetime.now(),
            last_updated=datetime.now(),
            file_size_mb=file_size_mb,
            metrics=metrics,
            model_version="2.0"  # 2-action system version
        )
        # Add to the registry (keyed by name, consistent with _load_registry)
        self.model_registry[model_name] = model_info
        self._save_registry()
        # Cleanup old models if necessary
        self._cleanup_models_by_type(model_type)
        logger.info(f"Registered 2-action model: {model_name}")
        logger.info(f"Model type: {model_type}, Size: {file_size_mb:.2f} MB")
        logger.info(f"Performance score: {metrics.get_composite_score():.4f}")
        return model_name
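
    # Usage sketch (hypothetical path and metric values):
    #
    #     manager = ModelManager()
    #     name = manager.register_model(
    #         model_path='NN/models/saved/cnn_latest.pt',
    #         model_type='cnn',
    #         metrics=ModelMetrics(profit_factor=1.8, win_rate=0.55, sharpe_ratio=0.9),
    #     )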

    def _should_keep_model(self, model_info: ModelInfo) -> bool:
        """Determine if a model should be kept based on performance"""
        score = model_info.metrics.get_composite_score()
        # Check minimum threshold
        if score < self.config['min_performance_threshold']:
            return False
        # Check size limit
        if model_info.file_size_mb > self.config['model_size_limit_mb']:
            logger.warning(f"Model too large: {model_info.file_size_mb:.1f}MB > {self.config['model_size_limit_mb']}MB")
            return False
        # Check if better than existing models of the same type
        existing_models = self.get_models_by_type(model_info.model_type)
        if len(existing_models) >= self.config['max_models_per_type']:
            # Find the worst performing model
            worst_model = min(existing_models.values(), key=lambda m: m.metrics.get_composite_score())
            if score <= worst_model.metrics.get_composite_score():
                return False
        return True

    def _cleanup_models_by_type(self, model_type: str):
        """Clean up old models of a specific type, keeping only the best ones"""
        models_of_type = self.get_models_by_type(model_type)
        max_keep = self.config['max_models_per_type']
        if len(models_of_type) <= max_keep:
            return
        # Sort by performance score, best first
        sorted_models = sorted(
            models_of_type.items(),
            key=lambda x: x[1].metrics.get_composite_score(),
            reverse=True
        )
        # Keep only the best max_keep models; remove the rest
        models_to_remove = sorted_models[max_keep:]
        for model_name, model_info in models_to_remove:
            try:
                # Remove file
                model_path = Path(model_info.file_path)
                if model_path.exists():
                    model_path.unlink()
                # Remove from registry
                del self.model_registry[model_name]
                logger.info(f"Removed old model: {model_name} (Score: {model_info.metrics.get_composite_score():.3f})")
            except Exception as e:
                logger.error(f"Error removing model {model_name}: {e}")
        # Persist the pruned registry
        self._save_registry()

    def get_models_by_type(self, model_type: str) -> Dict[str, ModelInfo]:
        """Get all models of a specific type"""
        return {
            name: info for name, info in self.model_registry.items()
            if info.model_type == model_type
        }

    def get_best_model(self, model_type: str) -> Optional[ModelInfo]:
        """Get the best performing model of a specific type"""
        models_of_type = self.get_models_by_type(model_type)
        if not models_of_type:
            return None
        return max(models_of_type.values(), key=lambda m: m.metrics.get_composite_score())
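
    # Usage sketch:
    #
    #     best_cnn = manager.get_best_model('cnn')
    #     if best_cnn:
    #         print(best_cnn.model_name, best_cnn.metrics.get_composite_score())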

    def load_best_models(self) -> Dict[str, Any]:
        """Load the best model of each type"""
        loaded_models = {}
        for model_type in ['cnn', 'rl', 'transformer']:
            best_model = self.get_best_model(model_type)
            if best_model:
                try:
                    model_path = Path(best_model.file_path)
                    if model_path.exists():
                        # Load the model
                        model_data = torch.load(model_path, map_location='cpu')
                        loaded_models[model_type] = {
                            'model': model_data,
                            'info': best_model,
                            'path': str(model_path)
                        }
                        logger.info(f"Loaded best {model_type} model: {best_model.model_name} "
                                    f"(Score: {best_model.metrics.get_composite_score():.3f})")
                    else:
                        logger.warning(f"Best {model_type} model file not found: {model_path}")
                except Exception as e:
                    logger.error(f"Error loading {model_type} model: {e}")
            else:
                logger.info(f"No {model_type} model available")
        return loaded_models

    def update_model_performance(self, model_name: str, metrics: ModelMetrics):
        """Update performance metrics for a model"""
        if model_name in self.model_registry:
            self.model_registry[model_name].metrics = metrics
            self.model_registry[model_name].last_updated = datetime.now()
            self._save_registry()
            logger.info(f"Updated metrics for {model_name}: Score {metrics.get_composite_score():.3f}")
        else:
            logger.warning(f"Model {model_name} not found in registry")

    def get_storage_stats(self) -> Dict[str, Any]:
        """Get storage usage statistics"""
        total_size_mb = 0.0
        model_count = 0
        for model_info in self.model_registry.values():
            total_size_mb += model_info.file_size_mb
            model_count += 1
        # Check actual storage usage
        actual_size_mb = 0.0
        if self.best_models_dir.exists():
            actual_size_mb = sum(
                f.stat().st_size for f in self.best_models_dir.rglob('*') if f.is_file()
            ) / 1024 / 1024
        return {
            'total_models': model_count,
            'registered_size_mb': total_size_mb,
            'actual_size_mb': actual_size_mb,
            'storage_limit_gb': self.config['total_storage_limit_gb'],
            'utilization_percent': (actual_size_mb / 1024) / self.config['total_storage_limit_gb'] * 100,
            'models_by_type': {
                model_type: len(self.get_models_by_type(model_type))
                for model_type in ['cnn', 'rl', 'transformer']
            }
        }
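
    # Example return value (illustrative numbers):
    #
    #     {'total_models': 4, 'registered_size_mb': 210.3, 'actual_size_mb': 198.7,
    #      'storage_limit_gb': 5.0, 'utilization_percent': 3.9,
    #      'models_by_type': {'cnn': 2, 'rl': 2, 'transformer': 0}}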

    def get_model_leaderboard(self) -> List[Dict[str, Any]]:
        """Get model performance leaderboard"""
        leaderboard = []
        for model_name, model_info in self.model_registry.items():
            leaderboard.append({
                'name': model_name,
                'type': model_info.model_type,
                'score': model_info.metrics.get_composite_score(),
                'profit_factor': model_info.metrics.profit_factor,
                'win_rate': model_info.metrics.win_rate,
                'sharpe_ratio': model_info.metrics.sharpe_ratio,
                'size_mb': model_info.file_size_mb,
                'age_days': (datetime.now() - model_info.creation_time).days,
                'last_updated': model_info.last_updated.strftime('%Y-%m-%d %H:%M')
            })
        # Sort by score, best first
        leaderboard.sort(key=lambda x: x['score'], reverse=True)
        return leaderboard
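
    # Each leaderboard entry is a flat dict, e.g. (illustrative):
    #
    #     {'name': 'cnn_2action_20250101_000000', 'type': 'cnn', 'score': 0.41,
    #      'profit_factor': 1.8, 'win_rate': 0.55, 'sharpe_ratio': 0.9,
    #      'size_mb': 45.2, 'age_days': 3, 'last_updated': '2025-01-04 12:00'}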

    def cleanup_checkpoints(self) -> Dict[str, Any]:
        """Clean up old checkpoint files"""
        cleanup_summary = {
            'deleted_files': 0,
            'freed_space_mb': 0.0,
            'errors': []
        }
        cutoff_date = datetime.now() - timedelta(days=self.config['max_checkpoint_age_days'])
        # Search for checkpoint files (rglob already searches recursively)
        checkpoint_patterns = [
            "checkpoint_*.pt",
            "model_*.pt",
            "*checkpoint*",
            "epoch_*.pt"
        ]
        for pattern in checkpoint_patterns:
            for file_path in self.base_dir.rglob(pattern):
                if "best_models" not in str(file_path) and file_path.is_file():
                    try:
                        file_time = datetime.fromtimestamp(file_path.stat().st_mtime)
                        if file_time < cutoff_date:
                            size_mb = file_path.stat().st_size / 1024 / 1024
                            file_path.unlink()
                            cleanup_summary['deleted_files'] += 1
                            cleanup_summary['freed_space_mb'] += size_mb
                    except Exception as e:
                        error_msg = f"Error deleting checkpoint {file_path}: {e}"
                        logger.error(error_msg)
                        cleanup_summary['errors'].append(error_msg)
        if cleanup_summary['deleted_files'] > 0:
            logger.info(f"Checkpoint cleanup: Deleted {cleanup_summary['deleted_files']} files, "
                        f"freed {cleanup_summary['freed_space_mb']:.1f}MB")
        return cleanup_summary
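
    # Typically invoked from a periodic maintenance task, e.g.:
    #
    #     summary = manager.cleanup_checkpoints()
    #     if summary['errors']:
    #         logger.warning(f"{len(summary['errors'])} checkpoint deletions failed")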


def create_model_manager() -> ModelManager:
    """Create and initialize a ModelManager with default settings"""
    return ModelManager()


# Example usage
if __name__ == "__main__":
    # Configure logging
    logging.basicConfig(level=logging.INFO)
    # Create model manager
    manager = ModelManager()
    # Clean up all existing models (with confirmation)
    print("WARNING: This will delete ALL existing models!")
    print("Type 'CONFIRM' to proceed:")
    user_input = input().strip()
    if user_input == "CONFIRM":
        cleanup_result = manager.cleanup_all_existing_models(confirm=True)
        print("\nCleanup complete:")
        print(f"- Deleted {cleanup_result['files_deleted']} files")
        print(f"- Freed {cleanup_result['space_freed_mb']:.1f}MB of space")
        print(f"- Cleaned {cleanup_result['directories_cleaned']} directories")
        if cleanup_result['errors']:
            print(f"- {len(cleanup_result['errors'])} errors occurred")
    else:
        print("Cleanup cancelled")