loss/performance display
@@ -271,15 +271,15 @@
     ],
     "decision": [
       {
-        "checkpoint_id": "decision_20250702_020007",
+        "checkpoint_id": "decision_20250702_031158",
         "model_name": "decision",
         "model_type": "decision_fusion",
-        "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-        "created_at": "2025-07-02T02:00:07.439094",
+        "file_path": "NN\\models\\saved\\decision\\decision_20250702_031158.pt",
+        "created_at": "2025-07-02T03:11:58.134610",
         "file_size_mb": 0.06720924377441406,
-        "performance_score": 9.999997759969705,
+        "performance_score": 101.79191339107027,
         "accuracy": null,
-        "loss": 2.240030294586859e-06,
+        "loss": 8.087262915050057e-05,
         "val_accuracy": null,
         "val_loss": null,
         "reward": null,
@@ -291,15 +291,15 @@
         "wandb_artifact_name": null
       },
       {
-        "checkpoint_id": "decision_20250702_020007",
+        "checkpoint_id": "decision_20250702_031158",
         "model_name": "decision",
         "model_type": "decision_fusion",
-        "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-        "created_at": "2025-07-02T02:00:07.707012",
+        "file_path": "NN\\models\\saved\\decision\\decision_20250702_031158.pt",
+        "created_at": "2025-07-02T03:11:58.418736",
         "file_size_mb": 0.06720924377441406,
-        "performance_score": 9.999997758801166,
+        "performance_score": 101.78996565336683,
         "accuracy": null,
-        "loss": 2.2411988334327916e-06,
+        "loss": 0.00010035353615320573,
         "val_accuracy": null,
         "val_loss": null,
         "reward": null,
@@ -311,15 +311,15 @@
         "wandb_artifact_name": null
       },
       {
-        "checkpoint_id": "decision_20250702_020007",
+        "checkpoint_id": "decision_20250702_031157",
         "model_name": "decision",
         "model_type": "decision_fusion",
-        "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-        "created_at": "2025-07-02T02:00:07.570949",
+        "file_path": "NN\\models\\saved\\decision\\decision_20250702_031157.pt",
+        "created_at": "2025-07-02T03:11:57.126366",
         "file_size_mb": 0.06720924377441406,
-        "performance_score": 9.999997757764104,
+        "performance_score": 101.78856752244535,
         "accuracy": null,
-        "loss": 2.2422358958193754e-06,
+        "loss": 0.00011433784719530295,
         "val_accuracy": null,
         "val_loss": null,
         "reward": null,
@@ -331,15 +331,15 @@
         "wandb_artifact_name": null
       },
       {
-        "checkpoint_id": "decision_20250702_020007",
+        "checkpoint_id": "decision_20250702_031157",
         "model_name": "decision",
         "model_type": "decision_fusion",
-        "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-        "created_at": "2025-07-02T02:00:07.867047",
+        "file_path": "NN\\models\\saved\\decision\\decision_20250702_031157.pt",
+        "created_at": "2025-07-02T03:11:57.884663",
         "file_size_mb": 0.06720924377441406,
-        "performance_score": 9.999997757753505,
+        "performance_score": 101.78849664377086,
         "accuracy": null,
-        "loss": 2.2422464945511442e-06,
+        "loss": 0.00011504679653424116,
         "val_accuracy": null,
         "val_loss": null,
         "reward": null,
@@ -351,15 +351,15 @@
         "wandb_artifact_name": null
       },
       {
-        "checkpoint_id": "decision_20250702_020007",
+        "checkpoint_id": "decision_20250702_031156",
         "model_name": "decision",
         "model_type": "decision_fusion",
-        "file_path": "NN\\models\\saved\\decision\\decision_20250702_020007.pt",
-        "created_at": "2025-07-02T02:00:07.302999",
+        "file_path": "NN\\models\\saved\\decision\\decision_20250702_031156.pt",
+        "created_at": "2025-07-02T03:11:56.934135",
         "file_size_mb": 0.06720924377441406,
-        "performance_score": 9.999997754320662,
+        "performance_score": 101.7830878040414,
         "accuracy": null,
-        "loss": 2.245679338091438e-06,
+        "loss": 0.00016915056666120008,
         "val_accuracy": null,
         "val_loss": null,
         "reward": null,
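Note: the jump in stored performance_score values (from just under 10 to roughly 101.8) tracks the scoring change further down in this commit. The old values match the old max(0, 10 - loss) term; the new ones are consistent with 100 / (1 + loss) plus a training-samples bonus. A quick re-derivation, where samples=18 is an assumption inferred from the constant +1.8 offset in every new entry:

    # Re-deriving the first entry's stored scores (samples=18 is an assumption):
    old_score = max(0, 10 - 2.240030294586859e-06)                  # -> 9.999997759969705
    new_score = 100 / (1 + 8.087262915050057e-05) + min(10, 18 / 10)
    print(new_score)                                                # -> 101.79191339...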
@@ -12,6 +12,7 @@ from typing import Dict, List, Optional, Tuple, Any
 from dataclasses import dataclass, asdict
 from collections import defaultdict
 import torch
+import random
 
 try:
     import wandb
@@ -150,36 +151,80 @@ class CheckpointManager:
         return None
 
     def _calculate_performance_score(self, metrics: Dict[str, float]) -> float:
+        """Calculate performance score with improved sensitivity for training models"""
         score = 0.0
 
-        if 'accuracy' in metrics:
-            score += metrics['accuracy'] * 100
-        if 'val_accuracy' in metrics:
-            score += metrics['val_accuracy'] * 100
+        # Prioritize loss reduction for active training models
         if 'loss' in metrics:
-            score += max(0, 10 - metrics['loss'])
-        if 'val_loss' in metrics:
-            score += max(0, 10 - metrics['val_loss'])
-        if 'reward' in metrics:
-            score += metrics['reward']
-        if 'pnl' in metrics:
-            score += metrics['pnl']
+            # Invert loss so lower loss = higher score, with better scaling
+            loss_value = metrics['loss']
+            if loss_value > 0:
+                score += max(0, 100 / (1 + loss_value))  # More sensitive to loss changes
+            else:
+                score += 100  # Perfect loss
+
+        # Add other metrics with appropriate weights
+        if 'accuracy' in metrics:
+            score += metrics['accuracy'] * 50  # Reduced weight to balance with loss
+        if 'val_accuracy' in metrics:
+            score += metrics['val_accuracy'] * 50
+        if 'val_loss' in metrics:
+            val_loss = metrics['val_loss']
+            if val_loss > 0:
+                score += max(0, 50 / (1 + val_loss))
+        if 'reward' in metrics:
+            score += metrics['reward'] * 10
+        if 'pnl' in metrics:
+            score += metrics['pnl'] * 5
+        if 'training_samples' in metrics:
+            # Bonus for processing more training samples
+            score += min(10, metrics['training_samples'] / 10)
 
         # Ensure minimum score for any training activity
         if score == 0.0 and metrics:
             # Use the first available metric with better scaling
             first_metric = next(iter(metrics.values()))
-            score = first_metric if first_metric > 0 else 0.1
+            if first_metric > 0:
+                score = max(0.1, min(10, first_metric))
+            else:
+                score = 0.1
 
         return max(score, 0.1)
 
     def _should_save_checkpoint(self, model_name: str, performance_score: float) -> bool:
+        """Improved checkpoint saving logic with more frequent saves during training"""
         if model_name not in self.checkpoints or not self.checkpoints[model_name]:
-            return True
+            return True  # Always save first checkpoint
 
+        # Allow more checkpoints during active training
         if len(self.checkpoints[model_name]) < self.max_checkpoints:
            return True
 
-        worst_score = min(cp.performance_score for cp in self.checkpoints[model_name])
-        return performance_score > worst_score
+        # Get current best and worst scores
+        scores = [cp.performance_score for cp in self.checkpoints[model_name]]
+        best_score = max(scores)
+        worst_score = min(scores)
+
+        # Save if better than worst (more frequent saves)
+        if performance_score > worst_score:
+            return True
+
+        # For high-performing models (score > 100), be more sensitive to small improvements
+        if best_score > 100:
+            # Save if within 0.1% of best score (very sensitive for converged models)
+            if performance_score >= best_score * 0.999:
+                return True
+        else:
+            # Also save if we're within 10% of best score (capture near-optimal models)
+            if performance_score >= best_score * 0.9:
+                return True
+
+        # Save more frequently during active training (every 5th attempt instead of 10th)
+        if random.random() < 0.2:  # 20% chance to save anyway
+            logger.info(f"Saving checkpoint for {model_name} - periodic save during active training")
+            return True
+
+        return False
 
     def _save_model_file(self, model, file_path: Path, model_type: str) -> bool:
         try:
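For context, a minimal standalone sketch of why the loss term was rescaled: near convergence the old max(0, 10 - loss) term moves only by the loss delta itself, while 100 / (1 + loss) has slope ~ -100 near zero, so the same delta moves the score about 100x more and converged checkpoints become distinguishable again. (The helper names here are illustrative, not part of the codebase.)

    def old_loss_term(loss: float) -> float:
        return max(0.0, 10 - loss)

    def new_loss_term(loss: float) -> float:
        # d/dloss [100 / (1 + loss)] ~= -100 near zero: ~100x more sensitive
        return max(0.0, 100 / (1 + loss)) if loss > 0 else 100.0

    for loss in (2.24e-06, 8.09e-05, 1.69e-04):
        print(f"loss={loss:.2e}  old={old_loss_term(loss):.7f}  new={new_loss_term(loss):.5f}")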
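And a deterministic sketch of the new save gate (the 20% random save is left out; should_save is a hypothetical free-function version of _should_save_checkpoint): once the best score exceeds 100, any candidate within 0.1% of the best is kept even when it beats no existing checkpoint.

    def should_save(scores: list, candidate: float, max_checkpoints: int = 5) -> bool:
        if not scores:
            return True                        # always save the first checkpoint
        if len(scores) < max_checkpoints:
            return True                        # room left in the rotation
        best, worst = max(scores), min(scores)
        if candidate > worst:
            return True                        # beats the worst kept checkpoint
        if best > 100:
            return candidate >= best * 0.999   # converged models: 0.1% window
        return candidate >= best * 0.9         # otherwise: 10% window

    print(should_save([101.79, 101.79, 101.79, 101.78, 101.78], 101.70))  # True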
@@ -3981,19 +3981,45 @@ class CleanTradingDashboard:
     async def _on_trading_decision(self, decision):
         """Handle trading decision from orchestrator."""
         try:
-            action = getattr(decision, 'action', decision.get('action'))
+            # Handle both object and dict formats
+            if hasattr(decision, 'action'):
+                action = getattr(decision, 'action', 'HOLD')
+                symbol = getattr(decision, 'symbol', 'ETH/USDT')
+                confidence = getattr(decision, 'confidence', 0.0)
+            else:
+                action = decision.get('action', 'HOLD')
+                symbol = decision.get('symbol', 'ETH/USDT')
+                confidence = decision.get('confidence', 0.0)
+
             if action == 'HOLD':
                 return
-            symbol = getattr(decision, 'symbol', decision.get('symbol', 'ETH/USDT'))
+
             if 'ETH' not in symbol.upper():
                 return
-            dashboard_decision = asdict(decision) if not isinstance(decision, dict) else decision.copy()
+
+            # Convert to dict format for dashboard storage
+            if hasattr(decision, '__dict__'):
+                dashboard_decision = {
+                    'action': action,
+                    'symbol': symbol,
+                    'confidence': confidence,
+                    'timestamp': datetime.now(),
+                    'executed': False
+                }
+                # Add any other attributes from the decision object
+                for attr in ['price', 'quantity', 'reasoning', 'model_source']:
+                    if hasattr(decision, attr):
+                        dashboard_decision[attr] = getattr(decision, attr)
+            else:
+                dashboard_decision = decision.copy()
+                dashboard_decision['timestamp'] = datetime.now()
+                dashboard_decision['executed'] = False
+
             self.recent_decisions.append(dashboard_decision)
             if len(self.recent_decisions) > 200:
                 self.recent_decisions.pop(0)
-            logger.info(f"[ORCHESTRATOR SIGNAL] Received: {action} for {symbol}")
+
+            logger.info(f"[ORCHESTRATOR SIGNAL] Received: {action} for {symbol} (confidence: {confidence:.3f})")
         except Exception as e:
             logger.error(f"Error handling trading decision: {e}")
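A self-contained illustration of the normalization this hunk adds, with a hypothetical TradingDecision dataclass standing in for whatever the orchestrator actually emits; both payload shapes reduce to the same tuple:

    from dataclasses import dataclass

    @dataclass
    class TradingDecision:               # hypothetical stand-in
        action: str = 'BUY'
        symbol: str = 'ETH/USDT'
        confidence: float = 0.72

    def normalize(decision):
        if hasattr(decision, 'action'):  # object format
            return (decision.action,
                    getattr(decision, 'symbol', 'ETH/USDT'),
                    getattr(decision, 'confidence', 0.0))
        return (decision.get('action', 'HOLD'),  # dict format
                decision.get('symbol', 'ETH/USDT'),
                decision.get('confidence', 0.0))

    assert normalize(TradingDecision()) == \
           normalize({'action': 'BUY', 'symbol': 'ETH/USDT', 'confidence': 0.72})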
@@ -4523,6 +4549,10 @@ class CleanTradingDashboard:
             network.train()
             action_logits, predicted_confidence = network(features_tensor)
 
+            # Ensure predicted_confidence has a batch dimension if it doesn't already
+            if predicted_confidence.dim() == 0:
+                predicted_confidence = predicted_confidence.unsqueeze(0)
+
             # Calculate losses
             action_loss = nn.CrossEntropyLoss()(action_logits, action_target_tensor)
             confidence_loss = nn.MSELoss()(predicted_confidence, confidence_target_tensor)
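The unsqueeze guards against a shape mismatch: with a batch of one, a squeezed network output can come back 0-dimensional while the confidence target has shape (1,), and nn.MSELoss then broadcasts across mismatched shapes with a warning. A standalone sketch with dummy tensors:

    import torch
    import torch.nn as nn

    predicted = torch.tensor(0.8)           # 0-dim scalar from the network
    target = torch.tensor([0.9])            # shape (1,) target tensor

    if predicted.dim() == 0:
        predicted = predicted.unsqueeze(0)  # -> shape (1,), matches the target

    print(nn.MSELoss()(predicted, target).item())  # (0.8 - 0.9)**2 ~= 0.01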
@@ -4559,7 +4589,10 @@
                 performance_metrics = {
                     'loss': avg_loss,
                     'training_samples': training_samples,
-                    'model_parameters': sum(p.numel() for p in network.parameters())
+                    'model_parameters': sum(p.numel() for p in network.parameters()),
+                    'loss_improvement': 1.0 / (1.0 + avg_loss),  # Higher is better
+                    'training_iterations': loss_count,
+                    'average_confidence': confidence_target if 'confidence_target' in locals() else 0.5
                 }
 
                 metadata = save_checkpoint(
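The added 'loss_improvement' entry simply maps avg_loss into (0, 1] so the checkpoint metadata carries a higher-is-better view of the same number (the avg_loss value here is hypothetical):

    avg_loss = 8.09e-05                          # hypothetical average loss
    loss_improvement = 1.0 / (1.0 + avg_loss)
    print(loss_improvement)                      # -> 0.99991911...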
@@ -4577,7 +4610,9 @@
                 logger.error(f"Error saving decision fusion checkpoint: {e}")
 
             if training_samples > 0:
-                logger.info(f"DECISION TRAINING: Processed {training_samples} decision fusion samples")
+                avg_loss_info = f", avg_loss={total_loss/loss_count:.6f}" if loss_count > 0 else ""
+                performance_score = 100 / (1 + (total_loss/loss_count)) if loss_count > 0 else 0.1
+                logger.info(f"DECISION TRAINING: Processed {training_samples} decision fusion samples{avg_loss_info}, perf_score={performance_score:.4f}")
         except Exception as e:
             logger.error(f"Error in real decision fusion training: {e}")
 