Measure model inference and training times

This commit is contained in:
Dobromir Popov
2025-07-02 00:47:18 +03:00
parent 3ad21582e0
commit c267657456
2 changed files with 269 additions and 33 deletions

View File

@ -1987,13 +1987,15 @@ class CleanTradingDashboard:
'created_at': dqn_state.get('created_at', 'Unknown'),
'performance_score': dqn_state.get('performance_score', 0.0)
},
# NEW: Timing information
'timing': {
'last_inference': dqn_timing['last_inference'].strftime('%H:%M:%S') if dqn_timing['last_inference'] else 'None',
'last_training': dqn_timing['last_training'].strftime('%H:%M:%S') if dqn_timing['last_training'] else 'None',
'inferences_per_second': f"{dqn_timing['inferences_per_second']:.2f}",
'predictions_24h': dqn_timing['prediction_count_24h']
}
# NEW: Timing information
'timing': {
'last_inference': dqn_timing['last_inference'].strftime('%H:%M:%S') if dqn_timing['last_inference'] else 'None',
'last_training': dqn_timing['last_training'].strftime('%H:%M:%S') if dqn_timing['last_training'] else 'None',
'inferences_per_second': f"{dqn_timing['inferences_per_second']:.2f}",
'predictions_24h': dqn_timing['prediction_count_24h']
},
# NEW: Performance metrics for split-second decisions
'performance': self.get_model_performance_metrics().get('dqn', {})
}
loaded_models['dqn'] = dqn_model_info
@ -2034,7 +2036,9 @@ class CleanTradingDashboard:
'last_training': cnn_timing['last_training'].strftime('%H:%M:%S') if cnn_timing['last_training'] else 'None',
'inferences_per_second': f"{cnn_timing['inferences_per_second']:.2f}",
'predictions_24h': cnn_timing['prediction_count_24h']
}
},
# NEW: Performance metrics for split-second decisions
'performance': self.get_model_performance_metrics().get('cnn', {})
}
loaded_models['cnn'] = cnn_model_info
@ -2070,7 +2074,9 @@ class CleanTradingDashboard:
'last_training': cob_timing['last_training'].strftime('%H:%M:%S') if cob_timing['last_training'] else 'None',
'inferences_per_second': f"{cob_timing['inferences_per_second']:.2f}",
'predictions_24h': cob_timing['prediction_count_24h']
}
},
# NEW: Performance metrics for split-second decisions
'performance': self.get_model_performance_metrics().get('cob_rl', {})
}
loaded_models['cob_rl'] = cob_model_info
@ -2111,7 +2117,9 @@ class CleanTradingDashboard:
'last_training': decision_timing['last_training'].strftime('%H:%M:%S') if decision_timing['last_training'] else 'None',
'inferences_per_second': f"{decision_timing['inferences_per_second']:.2f}",
'predictions_24h': decision_timing['prediction_count_24h']
}
},
# NEW: Performance metrics for split-second decisions
'performance': self.get_model_performance_metrics().get('decision', {})
}
loaded_models['decision'] = decision_model_info
@ -3993,38 +4001,92 @@ class CleanTradingDashboard:
"""Start real training system with data collection and actual model training"""
try:
def training_coordinator():
logger.info("TRAINING: Real training coordinator started")
logger.info("TRAINING: High-frequency training coordinator started")
training_iteration = 0
last_dqn_training = 0
last_cnn_training = 0
last_decision_training = 0
last_cob_rl_training = 0
# Performance tracking
self.training_performance = {
'decision': {'inference_times': [], 'training_times': [], 'total_calls': 0},
'cob_rl': {'inference_times': [], 'training_times': [], 'total_calls': 0},
'dqn': {'inference_times': [], 'training_times': [], 'total_calls': 0},
'cnn': {'inference_times': [], 'training_times': [], 'total_calls': 0}
}
while True:
try:
training_iteration += 1
current_time = time.time()
market_data = self._collect_training_data()
if market_data:
logger.debug(f"TRAINING: Collected {len(market_data)} market data points for training")
if current_time - last_dqn_training > 30:
self._perform_real_dqn_training(market_data)
last_dqn_training = current_time
if current_time - last_cnn_training > 45:
self._perform_real_cnn_training(market_data)
last_cnn_training = current_time
if current_time - last_decision_training > 60:
# High-frequency training for split-second decisions
# Train decision fusion and COB RL as fast as hardware allows
if current_time - last_decision_training > 0.1: # Every 100ms
start_time = time.time()
self._perform_real_decision_training(market_data)
training_time = time.time() - start_time
self.training_performance['decision']['training_times'].append(training_time)
self.training_performance['decision']['total_calls'] += 1
last_decision_training = current_time
if current_time - last_cob_rl_training > 90:
# Keep only last 100 measurements
if len(self.training_performance['decision']['training_times']) > 100:
self.training_performance['decision']['training_times'] = self.training_performance['decision']['training_times'][-100:]
if current_time - last_cob_rl_training > 0.1: # Every 100ms
start_time = time.time()
self._perform_real_cob_rl_training(market_data)
training_time = time.time() - start_time
self.training_performance['cob_rl']['training_times'].append(training_time)
self.training_performance['cob_rl']['total_calls'] += 1
last_cob_rl_training = current_time
# Keep only last 100 measurements
if len(self.training_performance['cob_rl']['training_times']) > 100:
self.training_performance['cob_rl']['training_times'] = self.training_performance['cob_rl']['training_times'][-100:]
# Standard frequency for larger models
if current_time - last_dqn_training > 30:
start_time = time.time()
self._perform_real_dqn_training(market_data)
training_time = time.time() - start_time
self.training_performance['dqn']['training_times'].append(training_time)
self.training_performance['dqn']['total_calls'] += 1
last_dqn_training = current_time
if len(self.training_performance['dqn']['training_times']) > 50:
self.training_performance['dqn']['training_times'] = self.training_performance['dqn']['training_times'][-50:]
if current_time - last_cnn_training > 45:
start_time = time.time()
self._perform_real_cnn_training(market_data)
training_time = time.time() - start_time
self.training_performance['cnn']['training_times'].append(training_time)
self.training_performance['cnn']['total_calls'] += 1
last_cnn_training = current_time
if len(self.training_performance['cnn']['training_times']) > 50:
self.training_performance['cnn']['training_times'] = self.training_performance['cnn']['training_times'][-50:]
self._update_training_progress(training_iteration)
if training_iteration % 10 == 0:
logger.info(f"TRAINING: Iteration {training_iteration} - DQN memory: {self._get_dqn_memory_size()}, CNN batches: {training_iteration // 10}")
time.sleep(10)
# Log performance metrics every 100 iterations
if training_iteration % 100 == 0:
self._log_training_performance()
logger.info(f"TRAINING: Iteration {training_iteration} - High-frequency training active")
# Minimal sleep for maximum responsiveness
time.sleep(0.05) # 50ms sleep for 20Hz training loop
except Exception as e:
logger.error(f"TRAINING: Error in training iteration {training_iteration}: {e}")
time.sleep(30)
time.sleep(1) # Shorter error recovery
training_thread = threading.Thread(target=training_coordinator, daemon=True)
training_thread.start()
logger.info("TRAINING: Real training system started successfully")
@ -4422,6 +4484,60 @@ class CleanTradingDashboard:
logger.info(f"Training progress: iteration {iteration}")
except Exception as e:
logger.error(f"Error updating training progress: {e}")
def _log_training_performance(self):
"""Log detailed training performance metrics"""
try:
if not hasattr(self, 'training_performance'):
return
for model_name, metrics in self.training_performance.items():
if metrics['training_times']:
avg_training = sum(metrics['training_times']) / len(metrics['training_times'])
max_training = max(metrics['training_times'])
min_training = min(metrics['training_times'])
logger.info(f"PERFORMANCE {model_name.upper()}: "
f"Avg={avg_training*1000:.1f}ms, "
f"Min={min_training*1000:.1f}ms, "
f"Max={max_training*1000:.1f}ms, "
f"Calls={metrics['total_calls']}")
except Exception as e:
logger.error(f"Error logging training performance: {e}")
def get_model_performance_metrics(self) -> Dict[str, Any]:
    """Return per-model training-time statistics as a plain dict.

    For each model tracked in ``self.training_performance``, reports the
    average/max/min training time in milliseconds, the total number of
    training calls, and the implied training frequency in Hz (1/avg).
    Models with no recorded timings get an all-zero entry (including
    ``total_calls``, matching the tracking contract). Returns ``{}`` when
    tracking has not been initialized or on any error.
    """
    try:
        if not hasattr(self, 'training_performance'):
            return {}

        def summarize(times, calls):
            # Collapse a list of per-call durations into the reported stats.
            if not times:
                return {
                    'avg_training_time_ms': 0,
                    'max_training_time_ms': 0,
                    'min_training_time_ms': 0,
                    'total_calls': 0,
                    'training_frequency_hz': 0
                }
            mean_s = sum(times) / len(times)
            return {
                'avg_training_time_ms': round(mean_s * 1000, 2),
                'max_training_time_ms': round(max(times) * 1000, 2),
                'min_training_time_ms': round(min(times) * 1000, 2),
                'total_calls': calls,
                'training_frequency_hz': round(1.0 / mean_s if mean_s > 0 else 0, 1)
            }

        return {
            name: summarize(stats['training_times'], stats['total_calls'])
            for name, stats in self.training_performance.items()
        }
    except Exception as e:
        logger.error(f"Error getting performance metrics: {e}")
        return {}
def create_clean_dashboard(data_provider: Optional[DataProvider] = None, orchestrator: Optional[TradingOrchestrator] = None, trading_executor: Optional[TradingExecutor] = None):