From 11718bf92f19d6e83b40d98fc90c49de07495981 Mon Sep 17 00:00:00 2001
From: Dobromir Popov
Date: Wed, 2 Jul 2025 03:29:38 +0300
Subject: [PATCH] loss/performance display

---
 NN/models/saved/checkpoint_metadata.json | 50 ++++++++--------
 utils/checkpoint_manager.py              | 75 +++++++++++++++++++-----
 web/clean_dashboard.py                   | 51 +++++++++++++---
 3 files changed, 128 insertions(+), 48 deletions(-)

diff --git a/NN/models/saved/checkpoint_metadata.json b/NN/models/saved/checkpoint_metadata.json
index c053ddb..a6bea55 100644
--- a/NN/models/saved/checkpoint_metadata.json
+++ b/NN/models/saved/checkpoint_metadata.json
@@ -271,15 +271,15 @@
   ],
   "decision": [
     {
-      "checkpoint_id": "decision_20250702_020007",
+      "checkpoint_id": "decision_20250702_031158",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-      "created_at": "2025-07-02T02:00:07.439094",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_031158.pt",
+      "created_at": "2025-07-02T03:11:58.134610",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999997759969705,
+      "performance_score": 101.79191339107027,
       "accuracy": null,
-      "loss": 2.240030294586859e-06,
+      "loss": 8.087262915050057e-05,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -291,15 +291,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_020007",
+      "checkpoint_id": "decision_20250702_031158",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-      "created_at": "2025-07-02T02:00:07.707012",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_031158.pt",
+      "created_at": "2025-07-02T03:11:58.418736",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999997758801166,
+      "performance_score": 101.78996565336683,
       "accuracy": null,
-      "loss": 2.2411988334327916e-06,
+      "loss": 0.00010035353615320573,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -311,15 +311,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_020007",
+      "checkpoint_id": "decision_20250702_031157",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-      "created_at": "2025-07-02T02:00:07.570949",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_031157.pt",
+      "created_at": "2025-07-02T03:11:57.126366",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999997757764104,
+      "performance_score": 101.78856752244535,
       "accuracy": null,
-      "loss": 2.2422358958193754e-06,
+      "loss": 0.00011433784719530295,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -331,15 +331,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_020007",
+      "checkpoint_id": "decision_20250702_031157",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-      "created_at": "2025-07-02T02:00:07.867047",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_031157.pt",
+      "created_at": "2025-07-02T03:11:57.884663",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999997757753505,
+      "performance_score": 101.78849664377086,
       "accuracy": null,
-      "loss": 2.2422464945511442e-06,
+      "loss": 0.00011504679653424116,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -351,15 +351,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_020007",
+      "checkpoint_id": "decision_20250702_031156",
      "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-      "created_at": "2025-07-02T02:00:07.302999",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_031156.pt",
+      "created_at": "2025-07-02T03:11:56.934135",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999997754320662,
+      "performance_score": 101.7830878040414,
       "accuracy": null,
-      "loss": 2.245679338091438e-06,
+      "loss": 0.00016915056666120008,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
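Note on the regenerated metadata above: the jump in "performance_score" from ~9.9999978 to ~101.79 tracks the reworked scoring in utils/checkpoint_manager.py below. The old scores match the previous 10 - loss term, while the new ones match 100 / (1 + loss) plus the smaller bonus terms. A minimal sanity check, assuming the ~1.8 residual comes from the new training-sample bonus (about 18 samples; the sample count itself is not stored in this file):

    # Hypothetical check against the first replaced entry; the 18-sample
    # bonus is an assumption, since training_samples is not recorded in the JSON.
    old_loss = 2.240030294586859e-06
    new_loss = 8.087262915050057e-05

    old_score = max(0, 10 - old_loss)                    # old formula: 10 - loss
    new_score = 100 / (1 + new_loss) + min(10, 18 / 10)  # new loss term + assumed sample bonus

    print(old_score)  # 9.999997759969705  -> matches the removed performance_score
    print(new_score)  # ~101.7919          -> matches the added performance_score
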
"decision_20250702_020007", + "checkpoint_id": "decision_20250702_031156", "model_name": "decision", "model_type": "decision_fusion", - "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt", - "created_at": "2025-07-02T02:00:07.302999", + "file_path": "NN\\models\\saved\\decision\\decision_20250702_031156.pt", + "created_at": "2025-07-02T03:11:56.934135", "file_size_mb": 0.06720924377441406, - "performance_score": 9.999997754320662, + "performance_score": 101.7830878040414, "accuracy": null, - "loss": 2.245679338091438e-06, + "loss": 0.00016915056666120008, "val_accuracy": null, "val_loss": null, "reward": null, diff --git a/utils/checkpoint_manager.py b/utils/checkpoint_manager.py index 499d572..2552bb6 100644 --- a/utils/checkpoint_manager.py +++ b/utils/checkpoint_manager.py @@ -12,6 +12,7 @@ from typing import Dict, List, Optional, Tuple, Any from dataclasses import dataclass, asdict from collections import defaultdict import torch +import random try: import wandb @@ -150,36 +151,80 @@ class CheckpointManager: return None def _calculate_performance_score(self, metrics: Dict[str, float]) -> float: + """Calculate performance score with improved sensitivity for training models""" score = 0.0 - if 'accuracy' in metrics: - score += metrics['accuracy'] * 100 - if 'val_accuracy' in metrics: - score += metrics['val_accuracy'] * 100 + # Prioritize loss reduction for active training models if 'loss' in metrics: - score += max(0, 10 - metrics['loss']) - if 'val_loss' in metrics: - score += max(0, 10 - metrics['val_loss']) - if 'reward' in metrics: - score += metrics['reward'] - if 'pnl' in metrics: - score += metrics['pnl'] + # Invert loss so lower loss = higher score, with better scaling + loss_value = metrics['loss'] + if loss_value > 0: + score += max(0, 100 / (1 + loss_value)) # More sensitive to loss changes + else: + score += 100 # Perfect loss + # Add other metrics with appropriate weights + if 'accuracy' in metrics: + score += metrics['accuracy'] * 50 # Reduced weight to balance with loss + if 'val_accuracy' in metrics: + score += metrics['val_accuracy'] * 50 + if 'val_loss' in metrics: + val_loss = metrics['val_loss'] + if val_loss > 0: + score += max(0, 50 / (1 + val_loss)) + if 'reward' in metrics: + score += metrics['reward'] * 10 + if 'pnl' in metrics: + score += metrics['pnl'] * 5 + if 'training_samples' in metrics: + # Bonus for processing more training samples + score += min(10, metrics['training_samples'] / 10) + + # Ensure minimum score for any training activity if score == 0.0 and metrics: + # Use the first available metric with better scaling first_metric = next(iter(metrics.values())) - score = first_metric if first_metric > 0 else 0.1 + if first_metric > 0: + score = max(0.1, min(10, first_metric)) + else: + score = 0.1 return max(score, 0.1) def _should_save_checkpoint(self, model_name: str, performance_score: float) -> bool: + """Improved checkpoint saving logic with more frequent saves during training""" if model_name not in self.checkpoints or not self.checkpoints[model_name]: - return True + return True # Always save first checkpoint + # Allow more checkpoints during active training if len(self.checkpoints[model_name]) < self.max_checkpoints: return True - worst_score = min(cp.performance_score for cp in self.checkpoints[model_name]) - return performance_score > worst_score + # Get current best and worst scores + scores = [cp.performance_score for cp in self.checkpoints[model_name]] + best_score = max(scores) + worst_score = min(scores) + + # Save if 
diff --git a/web/clean_dashboard.py b/web/clean_dashboard.py
index f46cbe4..0123d1c 100644
--- a/web/clean_dashboard.py
+++ b/web/clean_dashboard.py
@@ -3981,19 +3981,45 @@ class CleanTradingDashboard:
     async def _on_trading_decision(self, decision):
         """Handle trading decision from orchestrator."""
         try:
-            action = getattr(decision, 'action', decision.get('action'))
+            # Handle both object and dict formats
+            if hasattr(decision, 'action'):
+                action = getattr(decision, 'action', 'HOLD')
+                symbol = getattr(decision, 'symbol', 'ETH/USDT')
+                confidence = getattr(decision, 'confidence', 0.0)
+            else:
+                action = decision.get('action', 'HOLD')
+                symbol = decision.get('symbol', 'ETH/USDT')
+                confidence = decision.get('confidence', 0.0)
+
             if action == 'HOLD':
                 return
-            symbol = getattr(decision, 'symbol', decision.get('symbol', 'ETH/USDT'))
+
             if 'ETH' not in symbol.upper():
                 return
-            dashboard_decision = asdict(decision) if not isinstance(decision, dict) else decision.copy()
-            dashboard_decision['timestamp'] = datetime.now()
-            dashboard_decision['executed'] = False
+
+            # Convert to dict format for dashboard storage
+            if hasattr(decision, '__dict__'):
+                dashboard_decision = {
+                    'action': action,
+                    'symbol': symbol,
+                    'confidence': confidence,
+                    'timestamp': datetime.now(),
+                    'executed': False
+                }
+                # Add any other attributes from the decision object
+                for attr in ['price', 'quantity', 'reasoning', 'model_source']:
+                    if hasattr(decision, attr):
+                        dashboard_decision[attr] = getattr(decision, attr)
+            else:
+                dashboard_decision = decision.copy()
+                dashboard_decision['timestamp'] = datetime.now()
+                dashboard_decision['executed'] = False
+
             self.recent_decisions.append(dashboard_decision)
             if len(self.recent_decisions) > 200:
                 self.recent_decisions.pop(0)
-            logger.info(f"[ORCHESTRATOR SIGNAL] Received: {action} for {symbol}")
+
+            logger.info(f"[ORCHESTRATOR SIGNAL] Received: {action} for {symbol} (confidence: {confidence:.3f})")
         except Exception as e:
             logger.error(f"Error handling trading decision: {e}")
 
@@ -4523,6 +4549,10 @@ class CleanTradingDashboard:
                 network.train()
                 action_logits, predicted_confidence = network(features_tensor)
+                # Ensure predicted_confidence has a batch dimension if it doesn't already
+                if predicted_confidence.dim() == 0:
+                    predicted_confidence = predicted_confidence.unsqueeze(0)
+
                 # Calculate losses
                 action_loss = nn.CrossEntropyLoss()(action_logits, action_target_tensor)
                 confidence_loss = nn.MSELoss()(predicted_confidence, confidence_target_tensor)
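Context for the dim() guard added above: with a single unbatched sample, the confidence head can come back as a 0-dim tensor while the MSE target is built with shape (1,), and nn.MSELoss then warns about the size mismatch and falls back on broadcasting. A small standalone illustration (plain tensors, not the dashboard's actual network):

    import torch
    import torch.nn as nn

    predicted_confidence = torch.tensor(0.42)  # 0-dim output from an unbatched forward pass
    confidence_target = torch.tensor([0.50])   # target built with a batch dimension

    if predicted_confidence.dim() == 0:
        predicted_confidence = predicted_confidence.unsqueeze(0)  # shape () -> (1,)

    loss = nn.MSELoss()(predicted_confidence, confidence_target)  # shapes now match; no broadcast warning
    print(predicted_confidence.shape, loss.item())                # torch.Size([1]) 0.0064...
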
@@ -4559,7 +4589,10 @@ class CleanTradingDashboard:
                     performance_metrics = {
                         'loss': avg_loss,
                         'training_samples': training_samples,
-                        'model_parameters': sum(p.numel() for p in network.parameters())
+                        'model_parameters': sum(p.numel() for p in network.parameters()),
+                        'loss_improvement': 1.0 / (1.0 + avg_loss),  # Higher is better
+                        'training_iterations': loss_count,
+                        'average_confidence': confidence_target if 'confidence_target' in locals() else 0.5
                     }
 
                     metadata = save_checkpoint(
@@ -4577,7 +4610,9 @@ class CleanTradingDashboard:
                 logger.error(f"Error saving decision fusion checkpoint: {e}")
 
             if training_samples > 0:
-                logger.info(f"DECISION TRAINING: Processed {training_samples} decision fusion samples")
+                avg_loss_info = f", avg_loss={total_loss/loss_count:.6f}" if loss_count > 0 else ""
+                performance_score = 100 / (1 + (total_loss/loss_count)) if loss_count > 0 else 0.1
+                logger.info(f"DECISION TRAINING: Processed {training_samples} decision fusion samples{avg_loss_info}, perf_score={performance_score:.4f}")
 
         except Exception as e:
             logger.error(f"Error in real decision fusion training: {e}")
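The perf_score logged here mirrors only the 100 / (1 + avg_loss) loss term, so it will sit slightly below the performance_score the checkpoint manager stores once its bonus terms (e.g. training samples) are added. A sketch of the expected log line, using assumed values consistent with the first metadata entry above (18 iterations at avg_loss of about 8.09e-05):

    # Assumed values for illustration; chosen to be consistent with the metadata above.
    total_loss, loss_count, training_samples = 18 * 8.087262915050057e-05, 18, 18

    avg_loss_info = f", avg_loss={total_loss/loss_count:.6f}" if loss_count > 0 else ""
    performance_score = 100 / (1 + (total_loss/loss_count)) if loss_count > 0 else 0.1
    print(f"DECISION TRAINING: Processed {training_samples} decision fusion samples{avg_loss_info}, perf_score={performance_score:.4f}")
    # DECISION TRAINING: Processed 18 decision fusion samples, avg_loss=0.000081, perf_score=99.9919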