loss / performance display

Author: Dobromir Popov
Date: 2025-07-02 03:29:38 +03:00
commit 11718bf92f (parent 29e4076638)
3 changed files with 128 additions and 48 deletions

View File

@@ -271,15 +271,15 @@
     ],
     "decision": [
         {
-            "checkpoint_id": "decision_20250702_020007",
+            "checkpoint_id": "decision_20250702_031158",
             "model_name": "decision",
             "model_type": "decision_fusion",
-            "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-            "created_at": "2025-07-02T02:00:07.439094",
+            "file_path": "NN\\models\\saved\\decision\\decision_20250702_031158.pt",
+            "created_at": "2025-07-02T03:11:58.134610",
             "file_size_mb": 0.06720924377441406,
-            "performance_score": 9.999997759969705,
+            "performance_score": 101.79191339107027,
             "accuracy": null,
-            "loss": 2.240030294586859e-06,
+            "loss": 8.087262915050057e-05,
             "val_accuracy": null,
             "val_loss": null,
             "reward": null,
@@ -291,15 +291,15 @@
             "wandb_artifact_name": null
         },
         {
-            "checkpoint_id": "decision_20250702_020007",
+            "checkpoint_id": "decision_20250702_031158",
             "model_name": "decision",
             "model_type": "decision_fusion",
-            "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-            "created_at": "2025-07-02T02:00:07.707012",
+            "file_path": "NN\\models\\saved\\decision\\decision_20250702_031158.pt",
+            "created_at": "2025-07-02T03:11:58.418736",
             "file_size_mb": 0.06720924377441406,
-            "performance_score": 9.999997758801166,
+            "performance_score": 101.78996565336683,
             "accuracy": null,
-            "loss": 2.2411988334327916e-06,
+            "loss": 0.00010035353615320573,
             "val_accuracy": null,
             "val_loss": null,
             "reward": null,
@@ -311,15 +311,15 @@
             "wandb_artifact_name": null
         },
         {
-            "checkpoint_id": "decision_20250702_020007",
+            "checkpoint_id": "decision_20250702_031157",
             "model_name": "decision",
             "model_type": "decision_fusion",
-            "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-            "created_at": "2025-07-02T02:00:07.570949",
+            "file_path": "NN\\models\\saved\\decision\\decision_20250702_031157.pt",
+            "created_at": "2025-07-02T03:11:57.126366",
             "file_size_mb": 0.06720924377441406,
-            "performance_score": 9.999997757764104,
+            "performance_score": 101.78856752244535,
             "accuracy": null,
-            "loss": 2.2422358958193754e-06,
+            "loss": 0.00011433784719530295,
             "val_accuracy": null,
             "val_loss": null,
             "reward": null,
@@ -331,15 +331,15 @@
             "wandb_artifact_name": null
         },
         {
-            "checkpoint_id": "decision_20250702_020007",
+            "checkpoint_id": "decision_20250702_031157",
             "model_name": "decision",
             "model_type": "decision_fusion",
-            "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-            "created_at": "2025-07-02T02:00:07.867047",
+            "file_path": "NN\\models\\saved\\decision\\decision_20250702_031157.pt",
+            "created_at": "2025-07-02T03:11:57.884663",
             "file_size_mb": 0.06720924377441406,
-            "performance_score": 9.999997757753505,
+            "performance_score": 101.78849664377086,
             "accuracy": null,
-            "loss": 2.2422464945511442e-06,
+            "loss": 0.00011504679653424116,
             "val_accuracy": null,
             "val_loss": null,
             "reward": null,
@@ -351,15 +351,15 @@
             "wandb_artifact_name": null
         },
         {
-            "checkpoint_id": "decision_20250702_020007",
+            "checkpoint_id": "decision_20250702_031156",
             "model_name": "decision",
             "model_type": "decision_fusion",
-            "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-            "created_at": "2025-07-02T02:00:07.302999",
+            "file_path": "NN\\models\\saved\\decision\\decision_20250702_031156.pt",
+            "created_at": "2025-07-02T03:11:56.934135",
             "file_size_mb": 0.06720924377441406,
-            "performance_score": 9.999997754320662,
+            "performance_score": 101.7830878040414,
             "accuracy": null,
-            "loss": 2.245679338091438e-06,
+            "loss": 0.00016915056666120008,
             "val_accuracy": null,
             "val_loss": null,
             "reward": null,

View File

@@ -12,6 +12,7 @@ from typing import Dict, List, Optional, Tuple, Any
 from dataclasses import dataclass, asdict
 from collections import defaultdict
 import torch
+import random
 
 try:
     import wandb
@@ -150,36 +151,80 @@ class CheckpointManager:
         return None
 
     def _calculate_performance_score(self, metrics: Dict[str, float]) -> float:
+        """Calculate performance score with improved sensitivity for training models"""
         score = 0.0
-        if 'accuracy' in metrics:
-            score += metrics['accuracy'] * 100
-        if 'val_accuracy' in metrics:
-            score += metrics['val_accuracy'] * 100
+
+        # Prioritize loss reduction for active training models
         if 'loss' in metrics:
-            score += max(0, 10 - metrics['loss'])
-        if 'val_loss' in metrics:
-            score += max(0, 10 - metrics['val_loss'])
-        if 'reward' in metrics:
-            score += metrics['reward']
-        if 'pnl' in metrics:
-            score += metrics['pnl']
+            # Invert loss so lower loss = higher score, with better scaling
+            loss_value = metrics['loss']
+            if loss_value > 0:
+                score += max(0, 100 / (1 + loss_value))  # More sensitive to loss changes
+            else:
+                score += 100  # Perfect loss
+
+        # Add other metrics with appropriate weights
+        if 'accuracy' in metrics:
+            score += metrics['accuracy'] * 50  # Reduced weight to balance with loss
+        if 'val_accuracy' in metrics:
+            score += metrics['val_accuracy'] * 50
+        if 'val_loss' in metrics:
+            val_loss = metrics['val_loss']
+            if val_loss > 0:
+                score += max(0, 50 / (1 + val_loss))
+        if 'reward' in metrics:
+            score += metrics['reward'] * 10
+        if 'pnl' in metrics:
+            score += metrics['pnl'] * 5
+        if 'training_samples' in metrics:
+            # Bonus for processing more training samples
+            score += min(10, metrics['training_samples'] / 10)
+
+        # Ensure minimum score for any training activity
         if score == 0.0 and metrics:
+            # Use the first available metric with better scaling
            first_metric = next(iter(metrics.values()))
-            score = first_metric if first_metric > 0 else 0.1
+            if first_metric > 0:
+                score = max(0.1, min(10, first_metric))
+            else:
+                score = 0.1
+
         return max(score, 0.1)
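
Note: the rescored registry entries in the first file line up with this formula. A minimal check using the loss from the top entry; the training_samples value is an assumption (18 samples would account for the remaining 1.8 points):

    # Recompute the first registry entry's new performance_score (assumed inputs).
    loss = 8.087262915050057e-05              # from the registry diff above
    training_samples = 18                     # assumption; not recorded in the registry
    score = 100 / (1 + loss)                  # loss term, roughly 99.99191
    score += min(10, training_samples / 10)   # sample bonus of 1.8 under this assumption
    print(round(score, 5))                    # 101.79191, matching the entry above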

     def _should_save_checkpoint(self, model_name: str, performance_score: float) -> bool:
+        """Improved checkpoint saving logic with more frequent saves during training"""
         if model_name not in self.checkpoints or not self.checkpoints[model_name]:
-            return True
+            return True  # Always save first checkpoint
+
+        # Allow more checkpoints during active training
         if len(self.checkpoints[model_name]) < self.max_checkpoints:
             return True
-        worst_score = min(cp.performance_score for cp in self.checkpoints[model_name])
-        return performance_score > worst_score
+
+        # Get current best and worst scores
+        scores = [cp.performance_score for cp in self.checkpoints[model_name]]
+        best_score = max(scores)
+        worst_score = min(scores)
+
+        # Save if better than worst (more frequent saves)
+        if performance_score > worst_score:
+            return True
+
+        # For high-performing models (score > 100), be more sensitive to small improvements
+        if best_score > 100:
+            # Save if within 0.1% of best score (very sensitive for converged models)
+            if performance_score >= best_score * 0.999:
+                return True
+        else:
+            # Also save if we're within 10% of best score (capture near-optimal models)
+            if performance_score >= best_score * 0.9:
+                return True
+
+        # Save more frequently during active training (every 5th attempt instead of 10th)
+        if random.random() < 0.2:  # 20% chance to save anyway
+            logger.info(f"Saving checkpoint for {model_name} - periodic save during active training")
+            return True
+
+        return False
 
     def _save_model_file(self, model, file_path: Path, model_type: str) -> bool:
         try:
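
A standalone sketch of how the new save policy behaves once the checkpoint list is full, using clustered scores like those in the registry; _CP is a stand-in record, not the real checkpoint class:

    class _CP:  # hypothetical stand-in for a stored checkpoint record
        def __init__(self, performance_score):
            self.performance_score = performance_score

    existing = [_CP(101.792), _CP(101.790), _CP(101.789)]   # tightly clustered, as above
    scores = [cp.performance_score for cp in existing]
    best_score, worst_score = max(scores), min(scores)

    print(101.80 > worst_score)            # True  -> saved: beats the worst checkpoint
    print(101.75 > worst_score)            # False -> falls through to the score > 100 branch
    print(101.75 >= best_score * 0.999)    # True  -> saved anyway: within 0.1% of the best
    # A candidate failing both checks is still saved ~20% of the time (random.random() < 0.2).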

View File

@@ -3981,19 +3981,45 @@ class CleanTradingDashboard:
     async def _on_trading_decision(self, decision):
         """Handle trading decision from orchestrator."""
         try:
-            action = getattr(decision, 'action', decision.get('action'))
+            # Handle both object and dict formats
+            if hasattr(decision, 'action'):
+                action = getattr(decision, 'action', 'HOLD')
+                symbol = getattr(decision, 'symbol', 'ETH/USDT')
+                confidence = getattr(decision, 'confidence', 0.0)
+            else:
+                action = decision.get('action', 'HOLD')
+                symbol = decision.get('symbol', 'ETH/USDT')
+                confidence = decision.get('confidence', 0.0)
+
             if action == 'HOLD':
                 return
-            symbol = getattr(decision, 'symbol', decision.get('symbol', 'ETH/USDT'))
+
             if 'ETH' not in symbol.upper():
                 return
-            dashboard_decision = asdict(decision) if not isinstance(decision, dict) else decision.copy()
+
+            # Convert to dict format for dashboard storage
+            if hasattr(decision, '__dict__'):
+                dashboard_decision = {
+                    'action': action,
+                    'symbol': symbol,
+                    'confidence': confidence,
+                    'timestamp': datetime.now(),
+                    'executed': False
+                }
+                # Add any other attributes from the decision object
+                for attr in ['price', 'quantity', 'reasoning', 'model_source']:
+                    if hasattr(decision, attr):
+                        dashboard_decision[attr] = getattr(decision, attr)
+            else:
+                dashboard_decision = decision.copy()
+
             dashboard_decision['timestamp'] = datetime.now()
             dashboard_decision['executed'] = False
+
             self.recent_decisions.append(dashboard_decision)
             if len(self.recent_decisions) > 200:
                 self.recent_decisions.pop(0)
-            logger.info(f"[ORCHESTRATOR SIGNAL] Received: {action} for {symbol}")
+
+            logger.info(f"[ORCHESTRATOR SIGNAL] Received: {action} for {symbol} (confidence: {confidence:.3f})")
+
         except Exception as e:
             logger.error(f"Error handling trading decision: {e}")
@@ -4523,6 +4549,10 @@ class CleanTradingDashboard:
                     network.train()
                     action_logits, predicted_confidence = network(features_tensor)
 
+                    # Ensure predicted_confidence has a batch dimension if it doesn't already
+                    if predicted_confidence.dim() == 0:
+                        predicted_confidence = predicted_confidence.unsqueeze(0)
+
                     # Calculate losses
                     action_loss = nn.CrossEntropyLoss()(action_logits, action_target_tensor)
                     confidence_loss = nn.MSELoss()(predicted_confidence, confidence_target_tensor)
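
The dim() guard matters because nn.MSELoss expects prediction and target shapes to match; a 0-dim prediction against a 1-element target only works through broadcasting and can trigger a shape warning. A shape-only repro (no network involved):

    import torch
    import torch.nn as nn

    predicted_confidence = torch.tensor(0.63)        # 0-dim scalar output
    confidence_target_tensor = torch.tensor([0.80])  # 1-element batch target

    if predicted_confidence.dim() == 0:
        predicted_confidence = predicted_confidence.unsqueeze(0)  # shape [] -> [1]

    loss = nn.MSELoss()(predicted_confidence, confidence_target_tensor)
    print(predicted_confidence.shape, loss.item())   # torch.Size([1]) ~0.0289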
@@ -4559,7 +4589,10 @@ class CleanTradingDashboard:
                 performance_metrics = {
                     'loss': avg_loss,
                     'training_samples': training_samples,
-                    'model_parameters': sum(p.numel() for p in network.parameters())
+                    'model_parameters': sum(p.numel() for p in network.parameters()),
+                    'loss_improvement': 1.0 / (1.0 + avg_loss),  # Higher is better
+                    'training_iterations': loss_count,
+                    'average_confidence': confidence_target if 'confidence_target' in locals() else 0.5
                 }
 
                 metadata = save_checkpoint(
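
Of the new fields, only 'loss' (and 'training_samples') feeds the checkpoint scorer above; 'loss_improvement' = 1/(1 + avg_loss) is just a display-friendly 0-to-1 figure. A quick look at the mapping, with assumed avg_loss values:

    for avg_loss in (1.0, 0.1, 8.09e-05):                 # assumed values
        loss_improvement = 1.0 / (1.0 + avg_loss)         # as stored in performance_metrics
        score_term = 100 / (1 + avg_loss)                 # what the checkpoint scorer adds for 'loss'
        print(f"{avg_loss:g} -> improvement={loss_improvement:.4f}, score_term={score_term:.2f}")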
@@ -4577,7 +4610,9 @@ class CleanTradingDashboard:
                    logger.error(f"Error saving decision fusion checkpoint: {e}")
 
            if training_samples > 0:
-                logger.info(f"DECISION TRAINING: Processed {training_samples} decision fusion samples")
+                avg_loss_info = f", avg_loss={total_loss/loss_count:.6f}" if loss_count > 0 else ""
+                performance_score = 100 / (1 + (total_loss/loss_count)) if loss_count > 0 else 0.1
+                logger.info(f"DECISION TRAINING: Processed {training_samples} decision fusion samples{avg_loss_info}, perf_score={performance_score:.4f}")
 
        except Exception as e:
            logger.error(f"Error in real decision fusion training: {e}")