Measure model inference and training times
This commit is contained in:
@ -271,15 +271,15 @@
|
||||
],
|
||||
"decision": [
|
||||
{
|
||||
"checkpoint_id": "decision_20250702_004145",
|
||||
"checkpoint_id": "decision_20250702_004715",
|
||||
"model_name": "decision",
|
||||
"model_type": "decision_fusion",
|
||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_004145.pt",
|
||||
"created_at": "2025-07-02T00:41:45.478735",
|
||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_004715.pt",
|
||||
"created_at": "2025-07-02T00:47:15.226637",
|
||||
"file_size_mb": 0.06720924377441406,
|
||||
"performance_score": 8.93030759692192,
|
||||
"performance_score": 9.885439360547545,
|
||||
"accuracy": null,
|
||||
"loss": 1.0696924030780792,
|
||||
"loss": 0.1145606394524553,
|
||||
"val_accuracy": null,
|
||||
"val_loss": null,
|
||||
"reward": null,
|
||||
@ -291,15 +291,75 @@
|
||||
"wandb_artifact_name": null
|
||||
},
|
||||
{
|
||||
"checkpoint_id": "decision_20250702_004245",
|
||||
"checkpoint_id": "decision_20250702_004715",
|
||||
"model_name": "decision",
|
||||
"model_type": "decision_fusion",
|
||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_004245.pt",
|
||||
"created_at": "2025-07-02T00:42:45.982905",
|
||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_004715.pt",
|
||||
"created_at": "2025-07-02T00:47:15.477601",
|
||||
"file_size_mb": 0.06720924377441406,
|
||||
"performance_score": 9.178069523402623,
|
||||
"performance_score": 9.86977519926482,
|
||||
"accuracy": null,
|
||||
"loss": 0.8219304765973773,
|
||||
"loss": 0.13022480073517986,
|
||||
"val_accuracy": null,
|
||||
"val_loss": null,
|
||||
"reward": null,
|
||||
"pnl": null,
|
||||
"epoch": null,
|
||||
"training_time_hours": null,
|
||||
"total_parameters": null,
|
||||
"wandb_run_id": null,
|
||||
"wandb_artifact_name": null
|
||||
},
|
||||
{
|
||||
"checkpoint_id": "decision_20250702_004714",
|
||||
"model_name": "decision",
|
||||
"model_type": "decision_fusion",
|
||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_004714.pt",
|
||||
"created_at": "2025-07-02T00:47:14.411371",
|
||||
"file_size_mb": 0.06720924377441406,
|
||||
"performance_score": 9.869006871279064,
|
||||
"accuracy": null,
|
||||
"loss": 0.13099312872093702,
|
||||
"val_accuracy": null,
|
||||
"val_loss": null,
|
||||
"reward": null,
|
||||
"pnl": null,
|
||||
"epoch": null,
|
||||
"training_time_hours": null,
|
||||
"total_parameters": null,
|
||||
"wandb_run_id": null,
|
||||
"wandb_artifact_name": null
|
||||
},
|
||||
{
|
||||
"checkpoint_id": "decision_20250702_004716",
|
||||
"model_name": "decision",
|
||||
"model_type": "decision_fusion",
|
||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_004716.pt",
|
||||
"created_at": "2025-07-02T00:47:16.582136",
|
||||
"file_size_mb": 0.06720924377441406,
|
||||
"performance_score": 9.86168809807194,
|
||||
"accuracy": null,
|
||||
"loss": 0.1383119019280587,
|
||||
"val_accuracy": null,
|
||||
"val_loss": null,
|
||||
"reward": null,
|
||||
"pnl": null,
|
||||
"epoch": null,
|
||||
"training_time_hours": null,
|
||||
"total_parameters": null,
|
||||
"wandb_run_id": null,
|
||||
"wandb_artifact_name": null
|
||||
},
|
||||
{
|
||||
"checkpoint_id": "decision_20250702_004716",
|
||||
"model_name": "decision",
|
||||
"model_type": "decision_fusion",
|
||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_004716.pt",
|
||||
"created_at": "2025-07-02T00:47:16.828698",
|
||||
"file_size_mb": 0.06720924377441406,
|
||||
"performance_score": 9.861469801648386,
|
||||
"accuracy": null,
|
||||
"loss": 0.13853019835161312,
|
||||
"val_accuracy": null,
|
||||
"val_loss": null,
|
||||
"reward": null,
|
||||
@ -351,6 +411,66 @@
|
||||
"total_parameters": null,
|
||||
"wandb_run_id": null,
|
||||
"wandb_artifact_name": null
|
||||
},
|
||||
{
|
||||
"checkpoint_id": "cob_rl_20250702_004446",
|
||||
"model_name": "cob_rl",
|
||||
"model_type": "cob_rl",
|
||||
"file_path": "NN\\models\\saved\\cob_rl\\cob_rl_20250702_004446.pt",
|
||||
"created_at": "2025-07-02T00:44:46.656201",
|
||||
"file_size_mb": 0.001003265380859375,
|
||||
"performance_score": 9.644,
|
||||
"accuracy": null,
|
||||
"loss": 0.356,
|
||||
"val_accuracy": null,
|
||||
"val_loss": null,
|
||||
"reward": null,
|
||||
"pnl": null,
|
||||
"epoch": null,
|
||||
"training_time_hours": null,
|
||||
"total_parameters": null,
|
||||
"wandb_run_id": null,
|
||||
"wandb_artifact_name": null
|
||||
},
|
||||
{
|
||||
"checkpoint_id": "cob_rl_20250702_004617",
|
||||
"model_name": "cob_rl",
|
||||
"model_type": "cob_rl",
|
||||
"file_path": "NN\\models\\saved\\cob_rl\\cob_rl_20250702_004617.pt",
|
||||
"created_at": "2025-07-02T00:46:17.380509",
|
||||
"file_size_mb": 0.001003265380859375,
|
||||
"performance_score": 9.644,
|
||||
"accuracy": null,
|
||||
"loss": 0.356,
|
||||
"val_accuracy": null,
|
||||
"val_loss": null,
|
||||
"reward": null,
|
||||
"pnl": null,
|
||||
"epoch": null,
|
||||
"training_time_hours": null,
|
||||
"total_parameters": null,
|
||||
"wandb_run_id": null,
|
||||
"wandb_artifact_name": null
|
||||
},
|
||||
{
|
||||
"checkpoint_id": "cob_rl_20250702_004712",
|
||||
"model_name": "cob_rl",
|
||||
"model_type": "cob_rl",
|
||||
"file_path": "NN\\models\\saved\\cob_rl\\cob_rl_20250702_004712.pt",
|
||||
"created_at": "2025-07-02T00:47:12.447176",
|
||||
"file_size_mb": 0.001003265380859375,
|
||||
"performance_score": 9.644,
|
||||
"accuracy": null,
|
||||
"loss": 0.356,
|
||||
"val_accuracy": null,
|
||||
"val_loss": null,
|
||||
"reward": null,
|
||||
"pnl": null,
|
||||
"epoch": null,
|
||||
"training_time_hours": null,
|
||||
"total_parameters": null,
|
||||
"wandb_run_id": null,
|
||||
"wandb_artifact_name": null
|
||||
}
|
||||
]
|
||||
}
|
@ -1987,13 +1987,15 @@ class CleanTradingDashboard:
|
||||
'created_at': dqn_state.get('created_at', 'Unknown'),
|
||||
'performance_score': dqn_state.get('performance_score', 0.0)
|
||||
},
|
||||
# NEW: Timing information
|
||||
'timing': {
|
||||
'last_inference': dqn_timing['last_inference'].strftime('%H:%M:%S') if dqn_timing['last_inference'] else 'None',
|
||||
'last_training': dqn_timing['last_training'].strftime('%H:%M:%S') if dqn_timing['last_training'] else 'None',
|
||||
'inferences_per_second': f"{dqn_timing['inferences_per_second']:.2f}",
|
||||
'predictions_24h': dqn_timing['prediction_count_24h']
|
||||
}
|
||||
# NEW: Timing information
|
||||
'timing': {
|
||||
'last_inference': dqn_timing['last_inference'].strftime('%H:%M:%S') if dqn_timing['last_inference'] else 'None',
|
||||
'last_training': dqn_timing['last_training'].strftime('%H:%M:%S') if dqn_timing['last_training'] else 'None',
|
||||
'inferences_per_second': f"{dqn_timing['inferences_per_second']:.2f}",
|
||||
'predictions_24h': dqn_timing['prediction_count_24h']
|
||||
},
|
||||
# NEW: Performance metrics for split-second decisions
|
||||
'performance': self.get_model_performance_metrics().get('dqn', {})
|
||||
}
|
||||
loaded_models['dqn'] = dqn_model_info
|
||||
|
||||
@ -2034,7 +2036,9 @@ class CleanTradingDashboard:
|
||||
'last_training': cnn_timing['last_training'].strftime('%H:%M:%S') if cnn_timing['last_training'] else 'None',
|
||||
'inferences_per_second': f"{cnn_timing['inferences_per_second']:.2f}",
|
||||
'predictions_24h': cnn_timing['prediction_count_24h']
|
||||
}
|
||||
},
|
||||
# NEW: Performance metrics for split-second decisions
|
||||
'performance': self.get_model_performance_metrics().get('cnn', {})
|
||||
}
|
||||
loaded_models['cnn'] = cnn_model_info
|
||||
|
||||
@ -2070,7 +2074,9 @@ class CleanTradingDashboard:
|
||||
'last_training': cob_timing['last_training'].strftime('%H:%M:%S') if cob_timing['last_training'] else 'None',
|
||||
'inferences_per_second': f"{cob_timing['inferences_per_second']:.2f}",
|
||||
'predictions_24h': cob_timing['prediction_count_24h']
|
||||
}
|
||||
},
|
||||
# NEW: Performance metrics for split-second decisions
|
||||
'performance': self.get_model_performance_metrics().get('cob_rl', {})
|
||||
}
|
||||
loaded_models['cob_rl'] = cob_model_info
|
||||
|
||||
@ -2111,7 +2117,9 @@ class CleanTradingDashboard:
|
||||
'last_training': decision_timing['last_training'].strftime('%H:%M:%S') if decision_timing['last_training'] else 'None',
|
||||
'inferences_per_second': f"{decision_timing['inferences_per_second']:.2f}",
|
||||
'predictions_24h': decision_timing['prediction_count_24h']
|
||||
}
|
||||
},
|
||||
# NEW: Performance metrics for split-second decisions
|
||||
'performance': self.get_model_performance_metrics().get('decision', {})
|
||||
}
|
||||
loaded_models['decision'] = decision_model_info
|
||||
|
||||
@ -3993,38 +4001,92 @@ class CleanTradingDashboard:
|
||||
"""Start real training system with data collection and actual model training"""
|
||||
try:
|
||||
def training_coordinator():
|
||||
logger.info("TRAINING: Real training coordinator started")
|
||||
logger.info("TRAINING: High-frequency training coordinator started")
|
||||
training_iteration = 0
|
||||
last_dqn_training = 0
|
||||
last_cnn_training = 0
|
||||
last_decision_training = 0
|
||||
last_cob_rl_training = 0
|
||||
|
||||
# Performance tracking
|
||||
self.training_performance = {
|
||||
'decision': {'inference_times': [], 'training_times': [], 'total_calls': 0},
|
||||
'cob_rl': {'inference_times': [], 'training_times': [], 'total_calls': 0},
|
||||
'dqn': {'inference_times': [], 'training_times': [], 'total_calls': 0},
|
||||
'cnn': {'inference_times': [], 'training_times': [], 'total_calls': 0}
|
||||
}
|
||||
|
||||
while True:
|
||||
try:
|
||||
training_iteration += 1
|
||||
current_time = time.time()
|
||||
market_data = self._collect_training_data()
|
||||
|
||||
if market_data:
|
||||
logger.debug(f"TRAINING: Collected {len(market_data)} market data points for training")
|
||||
if current_time - last_dqn_training > 30:
|
||||
self._perform_real_dqn_training(market_data)
|
||||
last_dqn_training = current_time
|
||||
if current_time - last_cnn_training > 45:
|
||||
self._perform_real_cnn_training(market_data)
|
||||
last_cnn_training = current_time
|
||||
if current_time - last_decision_training > 60:
|
||||
|
||||
# High-frequency training for split-second decisions
|
||||
# Train decision fusion and COB RL as fast as hardware allows
|
||||
if current_time - last_decision_training > 0.1: # Every 100ms
|
||||
start_time = time.time()
|
||||
self._perform_real_decision_training(market_data)
|
||||
training_time = time.time() - start_time
|
||||
self.training_performance['decision']['training_times'].append(training_time)
|
||||
self.training_performance['decision']['total_calls'] += 1
|
||||
last_decision_training = current_time
|
||||
if current_time - last_cob_rl_training > 90:
|
||||
|
||||
# Keep only last 100 measurements
|
||||
if len(self.training_performance['decision']['training_times']) > 100:
|
||||
self.training_performance['decision']['training_times'] = self.training_performance['decision']['training_times'][-100:]
|
||||
|
||||
if current_time - last_cob_rl_training > 0.1: # Every 100ms
|
||||
start_time = time.time()
|
||||
self._perform_real_cob_rl_training(market_data)
|
||||
training_time = time.time() - start_time
|
||||
self.training_performance['cob_rl']['training_times'].append(training_time)
|
||||
self.training_performance['cob_rl']['total_calls'] += 1
|
||||
last_cob_rl_training = current_time
|
||||
|
||||
# Keep only last 100 measurements
|
||||
if len(self.training_performance['cob_rl']['training_times']) > 100:
|
||||
self.training_performance['cob_rl']['training_times'] = self.training_performance['cob_rl']['training_times'][-100:]
|
||||
|
||||
# Standard frequency for larger models
|
||||
if current_time - last_dqn_training > 30:
|
||||
start_time = time.time()
|
||||
self._perform_real_dqn_training(market_data)
|
||||
training_time = time.time() - start_time
|
||||
self.training_performance['dqn']['training_times'].append(training_time)
|
||||
self.training_performance['dqn']['total_calls'] += 1
|
||||
last_dqn_training = current_time
|
||||
|
||||
if len(self.training_performance['dqn']['training_times']) > 50:
|
||||
self.training_performance['dqn']['training_times'] = self.training_performance['dqn']['training_times'][-50:]
|
||||
|
||||
if current_time - last_cnn_training > 45:
|
||||
start_time = time.time()
|
||||
self._perform_real_cnn_training(market_data)
|
||||
training_time = time.time() - start_time
|
||||
self.training_performance['cnn']['training_times'].append(training_time)
|
||||
self.training_performance['cnn']['total_calls'] += 1
|
||||
last_cnn_training = current_time
|
||||
|
||||
if len(self.training_performance['cnn']['training_times']) > 50:
|
||||
self.training_performance['cnn']['training_times'] = self.training_performance['cnn']['training_times'][-50:]
|
||||
|
||||
self._update_training_progress(training_iteration)
|
||||
if training_iteration % 10 == 0:
|
||||
logger.info(f"TRAINING: Iteration {training_iteration} - DQN memory: {self._get_dqn_memory_size()}, CNN batches: {training_iteration // 10}")
|
||||
time.sleep(10)
|
||||
|
||||
# Log performance metrics every 100 iterations
|
||||
if training_iteration % 100 == 0:
|
||||
self._log_training_performance()
|
||||
logger.info(f"TRAINING: Iteration {training_iteration} - High-frequency training active")
|
||||
|
||||
# Minimal sleep for maximum responsiveness
|
||||
time.sleep(0.05) # 50ms sleep for 20Hz training loop
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"TRAINING: Error in training iteration {training_iteration}: {e}")
|
||||
time.sleep(30)
|
||||
time.sleep(1) # Shorter error recovery
|
||||
training_thread = threading.Thread(target=training_coordinator, daemon=True)
|
||||
training_thread.start()
|
||||
logger.info("TRAINING: Real training system started successfully")
|
||||
@ -4422,6 +4484,60 @@ class CleanTradingDashboard:
|
||||
logger.info(f"Training progress: iteration {iteration}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating training progress: {e}")
|
||||
|
||||
def _log_training_performance(self):
|
||||
"""Log detailed training performance metrics"""
|
||||
try:
|
||||
if not hasattr(self, 'training_performance'):
|
||||
return
|
||||
|
||||
for model_name, metrics in self.training_performance.items():
|
||||
if metrics['training_times']:
|
||||
avg_training = sum(metrics['training_times']) / len(metrics['training_times'])
|
||||
max_training = max(metrics['training_times'])
|
||||
min_training = min(metrics['training_times'])
|
||||
|
||||
logger.info(f"PERFORMANCE {model_name.upper()}: "
|
||||
f"Avg={avg_training*1000:.1f}ms, "
|
||||
f"Min={min_training*1000:.1f}ms, "
|
||||
f"Max={max_training*1000:.1f}ms, "
|
||||
f"Calls={metrics['total_calls']}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error logging training performance: {e}")
|
||||
|
||||
def get_model_performance_metrics(self) -> Dict[str, Any]:
    """Summarise the recorded training durations of every tracked model.

    Returns:
        A dict keyed by model name. Each value holds
        ``avg_training_time_ms``, ``max_training_time_ms``,
        ``min_training_time_ms`` (rounded to 2 decimals), ``total_calls``
        and ``training_frequency_hz``. The frequency is the reciprocal of
        the average duration in seconds, rounded to 1 decimal; it is not a
        measured call rate. A model with no recorded durations gets an
        all-zero entry. An empty dict is returned when
        ``self.training_performance`` does not exist, or when any exception
        occurs (the exception is logged).
    """
    try:
        if not hasattr(self, 'training_performance'):
            return {}

        summary = {}
        for name, stats in self.training_performance.items():
            durations = stats['training_times']

            if not durations:
                # No samples yet: report zeros, including total_calls.
                summary[name] = {
                    'avg_training_time_ms': 0,
                    'max_training_time_ms': 0,
                    'min_training_time_ms': 0,
                    'total_calls': 0,
                    'training_frequency_hz': 0,
                }
                continue

            mean_seconds = sum(durations) / len(durations)
            slowest = max(durations)
            fastest = min(durations)

            summary[name] = {
                'avg_training_time_ms': round(mean_seconds * 1000, 2),
                'max_training_time_ms': round(slowest * 1000, 2),
                'min_training_time_ms': round(fastest * 1000, 2),
                'total_calls': stats['total_calls'],
                # Guard the division: a zero mean would otherwise raise.
                'training_frequency_hz': round(
                    1.0 / mean_seconds if mean_seconds > 0 else 0, 1
                ),
            }

        return summary
    except Exception as e:
        logger.error(f"Error getting performance metrics: {e}")
        return {}
|
||||
|
||||
|
||||
def create_clean_dashboard(data_provider: Optional[DataProvider] = None, orchestrator: Optional[TradingOrchestrator] = None, trading_executor: Optional[TradingExecutor] = None):
|
||||
|
Reference in New Issue
Block a user