loss/performance display

Dobromir Popov
2025-07-02 03:29:38 +03:00
parent 29e4076638
commit 11718bf92f
3 changed files with 128 additions and 48 deletions


@@ -12,6 +12,7 @@ from typing import Dict, List, Optional, Tuple, Any
 from dataclasses import dataclass, asdict
 from collections import defaultdict
 import torch
+import random
 
 try:
     import wandb
@@ -150,36 +151,80 @@ class CheckpointManager:
         return None
 
     def _calculate_performance_score(self, metrics: Dict[str, float]) -> float:
         """Calculate performance score with improved sensitivity for training models"""
         score = 0.0
-        if 'accuracy' in metrics:
-            score += metrics['accuracy'] * 100
-        if 'val_accuracy' in metrics:
-            score += metrics['val_accuracy'] * 100
+        # Prioritize loss reduction for active training models
         if 'loss' in metrics:
-            score += max(0, 10 - metrics['loss'])
-        if 'val_loss' in metrics:
-            score += max(0, 10 - metrics['val_loss'])
-        if 'reward' in metrics:
-            score += metrics['reward']
-        if 'pnl' in metrics:
-            score += metrics['pnl']
+            # Invert loss so lower loss = higher score, with better scaling
+            loss_value = metrics['loss']
+            if loss_value > 0:
+                score += max(0, 100 / (1 + loss_value))  # More sensitive to loss changes
+            else:
+                score += 100  # Perfect loss
+
+        # Add other metrics with appropriate weights
+        if 'accuracy' in metrics:
+            score += metrics['accuracy'] * 50  # Reduced weight to balance with loss
+        if 'val_accuracy' in metrics:
+            score += metrics['val_accuracy'] * 50
+        if 'val_loss' in metrics:
+            val_loss = metrics['val_loss']
+            if val_loss > 0:
+                score += max(0, 50 / (1 + val_loss))
+        if 'reward' in metrics:
+            score += metrics['reward'] * 10
+        if 'pnl' in metrics:
+            score += metrics['pnl'] * 5
+        if 'training_samples' in metrics:
+            # Bonus for processing more training samples
+            score += min(10, metrics['training_samples'] / 10)
+
         # Ensure minimum score for any training activity
         if score == 0.0 and metrics:
             # Use the first available metric with better scaling
             first_metric = next(iter(metrics.values()))
-            score = first_metric if first_metric > 0 else 0.1
+            if first_metric > 0:
+                score = max(0.1, min(10, first_metric))
+            else:
+                score = 0.1
 
         return max(score, 0.1)
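
For context, a small standalone sketch (not part of the commit; helper names are illustrative only) comparing the old linear loss term with the new inverse scaling used above:

# Illustrative only: compares the old linear loss term (10 - loss) with the
# new inverse scaling (100 / (1 + loss)) from _calculate_performance_score.
def old_loss_term(loss: float) -> float:
    return max(0, 10 - loss)  # flat near zero loss, clips to 0 once loss >= 10

def new_loss_term(loss: float) -> float:
    return max(0, 100 / (1 + loss)) if loss > 0 else 100  # smooth, never clips

for loss in (0.01, 0.1, 1.0, 5.0, 20.0):
    print(f"loss={loss:>5}: old={old_loss_term(loss):6.2f}  new={new_loss_term(loss):6.2f}")
# loss= 0.01: old=  9.99  new= 99.01
# loss=  0.1: old=  9.90  new= 90.91
# loss=  1.0: old=  9.00  new= 50.00
# loss=  5.0: old=  5.00  new= 16.67
# loss= 20.0: old=  0.00  new=  4.76

The inverse form keeps differentiating between small losses (going from 0.1 to 0.01 moves the score by about 8 points instead of about 0.1), which is what makes checkpoint ranking more sensitive as a model converges.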
 
     def _should_save_checkpoint(self, model_name: str, performance_score: float) -> bool:
         """Improved checkpoint saving logic with more frequent saves during training"""
         if model_name not in self.checkpoints or not self.checkpoints[model_name]:
-            return True
+            return True  # Always save first checkpoint
+
+        # Allow more checkpoints during active training
         if len(self.checkpoints[model_name]) < self.max_checkpoints:
             return True
-        worst_score = min(cp.performance_score for cp in self.checkpoints[model_name])
-        return performance_score > worst_score
+
+        # Get current best and worst scores
+        scores = [cp.performance_score for cp in self.checkpoints[model_name]]
+        best_score = max(scores)
+        worst_score = min(scores)
+
+        # Save if better than worst (more frequent saves)
+        if performance_score > worst_score:
+            return True
+
+        # For high-performing models (score > 100), be more sensitive to small improvements
+        if best_score > 100:
+            # Save if within 0.1% of best score (very sensitive for converged models)
+            if performance_score >= best_score * 0.999:
+                return True
+        else:
+            # Also save if we're within 10% of best score (capture near-optimal models)
+            if performance_score >= best_score * 0.9:
+                return True
+
+        # Save more frequently during active training (every 5th attempt instead of 10th)
+        if random.random() < 0.2:  # 20% chance to save anyway
+            logger.info(f"Saving checkpoint for {model_name} - periodic save during active training")
+            return True
+
+        return False
 
     def _save_model_file(self, model, file_path: Path, model_type: str) -> bool:
         try:
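
To see how the new save policy behaves end to end, here is a minimal standalone sketch (not from the repository) that re-implements the decision sequence of _should_save_checkpoint over a plain list of scores; `should_save` and its inputs are hypothetical stand-ins for the stored checkpoint metadata:

# Hypothetical re-implementation of the _should_save_checkpoint decision
# sequence, for illustration; `scores` stands in for the stored
# checkpoint.performance_score values.
import random

def should_save(scores, new_score, max_checkpoints=5):
    if not scores:
        return True                        # always save the first checkpoint
    if len(scores) < max_checkpoints:
        return True                        # rotation not full yet
    best, worst = max(scores), min(scores)
    if new_score > worst:
        return True                        # beats the worst kept checkpoint
    band = 0.999 if best > 100 else 0.9    # tighter band once scores exceed 100
    if new_score >= best * band:
        return True                        # near-optimal, keep it anyway
    return random.random() < 0.2           # periodic save during active training

# Scores clustered near convergence: 119.89 is below the current worst (119.90)
# but within 0.1% of the best (120.0 * 0.999 = 119.88), so it is still saved.
print(should_save([120.0, 119.99, 119.95, 119.92, 119.90], 119.89))  # True

Note that the 0.1% band only fires when the kept scores are already tightly clustered, so in practice most extra saves for converged models come from that band plus the 20% periodic save.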