diff --git a/ANNOTATE/LOGGING_CONFIGURATION.md b/ANNOTATE/LOGGING_CONFIGURATION.md new file mode 100644 index 0000000..8cca75b --- /dev/null +++ b/ANNOTATE/LOGGING_CONFIGURATION.md @@ -0,0 +1,184 @@ +# Logging Configuration + +## Issue: Excessive Werkzeug Logs + +### Problem +``` +2025-10-31 03:23:53,478 - werkzeug - INFO - 127.0.0.1 - - [31/Oct/2025 03:23:53] "POST /api/training-progress HTTP/1.1" 200 - +2025-10-31 03:23:55,519 - werkzeug - INFO - 127.0.0.1 - - [31/Oct/2025 03:23:55] "POST /api/training-progress HTTP/1.1" 200 - +2025-10-31 03:23:56,533 - werkzeug - INFO - 127.0.0.1 - - [31/Oct/2025 03:23:56] "POST /api/training-progress HTTP/1.1" 200 - +... +``` + +**Cause**: The frontend polls `/api/training-progress` every 1-2 seconds, and Flask's werkzeug logger logs every request at INFO level. + +--- + +## Solution + +### Fixed in `ANNOTATE/web/app.py` + +```python +# Initialize Flask app +self.server = Flask( + __name__, + template_folder='templates', + static_folder='static' +) + +# Suppress werkzeug request logs (reduce noise from polling endpoints) +werkzeug_logger = logging.getLogger('werkzeug') +werkzeug_logger.setLevel(logging.WARNING) # Only show warnings and errors, not INFO +``` + +**Result**: Werkzeug will now only log warnings and errors, not every request. + +--- + +## Logging Levels + +### Before (Noisy) +``` +INFO - Every request logged +INFO - GET /api/chart-data +INFO - POST /api/training-progress +INFO - GET /static/css/style.css +... (hundreds of lines per minute) +``` + +### After (Clean) +``` +WARNING - Only important events +ERROR - Only errors +... (quiet unless something is wrong) +``` + +--- + +## Customization + +### Show Only Errors +```python +werkzeug_logger.setLevel(logging.ERROR) # Only errors +``` + +### Show All Requests (Debug Mode) +```python +werkzeug_logger.setLevel(logging.INFO) # All requests (default) +``` + +### Selective Filtering +```python +# Custom filter to exclude specific endpoints +class ExcludeEndpointFilter(logging.Filter): + def filter(self, record): + # Exclude training-progress endpoint + return '/api/training-progress' not in record.getMessage() + +werkzeug_logger.addFilter(ExcludeEndpointFilter()) +``` + +--- + +## Other Loggers + +### Application Logger +```python +# Your application logs (keep at INFO) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +``` + +### Third-Party Libraries +```python +# Suppress noisy third-party loggers +logging.getLogger('urllib3').setLevel(logging.WARNING) +logging.getLogger('requests').setLevel(logging.WARNING) +logging.getLogger('matplotlib').setLevel(logging.WARNING) +``` + +--- + +## Log File Configuration + +### Current Setup +```python +log_file = Path(__file__).parent.parent / 'logs' / f'annotate_{datetime.now().strftime("%Y%m%d")}.log' + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(log_file), + logging.StreamHandler(sys.stdout) + ] +) +``` + +### Recommended: Separate Log Files +```python +# Application logs +app_log = 'logs/annotate_app.log' +app_handler = logging.FileHandler(app_log) +app_handler.setLevel(logging.INFO) + +# Request logs (if needed) +request_log = 'logs/annotate_requests.log' +request_handler = logging.FileHandler(request_log) +request_handler.setLevel(logging.DEBUG) + +# Configure werkzeug to use separate file +werkzeug_logger = logging.getLogger('werkzeug') +werkzeug_logger.addHandler(request_handler) +werkzeug_logger.setLevel(logging.WARNING) # Still 
suppress in main log +``` + +--- + +## Summary + +### What Changed +- ✅ Werkzeug logger set to WARNING level +- ✅ No more INFO logs for every request +- ✅ Still logs errors and warnings +- ✅ Application logs unchanged + +### Result +``` +Before: 100+ log lines per minute (polling) +After: 0-5 log lines per minute (only important events) +``` + +### To Revert +```python +# Show all requests again +werkzeug_logger.setLevel(logging.INFO) +``` + +--- + +## Best Practices + +1. **Production**: Use WARNING or ERROR for werkzeug +2. **Development**: Use INFO for debugging +3. **Polling Endpoints**: Always suppress or use separate log file +4. **Application Logs**: Keep at INFO or DEBUG as needed +5. **Third-Party**: Suppress noisy libraries + +--- + +## Testing + +After the change, you should see: +``` +✅ No more werkzeug INFO logs +✅ Application logs still visible +✅ Errors still logged +✅ Clean console output +``` + +If you need to see requests for debugging: +```python +# Temporarily enable +logging.getLogger('werkzeug').setLevel(logging.INFO) +``` diff --git a/ANNOTATE/core/real_training_adapter.py b/ANNOTATE/core/real_training_adapter.py index 79a0c85..48ead9d 100644 --- a/ANNOTATE/core/real_training_adapter.py +++ b/ANNOTATE/core/real_training_adapter.py @@ -615,7 +615,7 @@ class RealTrainingAdapter: # Show breakdown of before/after before_count = sum(1 for s in negative_samples if 'before' in str(s.get('timestamp', ''))) after_count = len(negative_samples) - before_count - logger.info(f" -> {before_count} beforesignal, {after_count} after signal") + logger.info(f" -> {before_count} before signal, {after_count} after signal") except Exception as e: logger.error(f" Error preparing test case {i+1}: {e}") @@ -1413,12 +1413,17 @@ class RealTrainingAdapter: result = trainer.train_step(batch) if result is not None: - epoch_loss += result.get('total_loss', 0.0) - epoch_accuracy += result.get('accuracy', 0.0) + batch_loss = result.get('total_loss', 0.0) + batch_accuracy = result.get('accuracy', 0.0) + epoch_loss += batch_loss + epoch_accuracy += batch_accuracy num_batches += 1 - - if (i + 1) % 100 == 0: - logger.info(f" Batch {i + 1}/{len(converted_batches)}, Loss: {result.get('total_loss', 0.0):.6f}, Accuracy: {result.get('accuracy', 0.0):.2%}") + + # Log first batch and every 100th batch for debugging + if (i + 1) == 1 or (i + 1) % 100 == 0: + logger.info(f" Batch {i + 1}/{len(converted_batches)}, Loss: {batch_loss:.6f}, Accuracy: {batch_accuracy:.4f}") + else: + logger.warning(f" Batch {i + 1} returned None result - skipping") except Exception as e: logger.error(f" Error in batch {i + 1}: {e}") diff --git a/ANNOTATE/web/app.py b/ANNOTATE/web/app.py index 733cba5..2e44925 100644 --- a/ANNOTATE/web/app.py +++ b/ANNOTATE/web/app.py @@ -130,6 +130,10 @@ class AnnotationDashboard: static_folder='static' ) + # Suppress werkzeug request logs (reduce noise from polling endpoints) + werkzeug_logger = logging.getLogger('werkzeug') + werkzeug_logger.setLevel(logging.WARNING) # Only show warnings and errors, not INFO + # Initialize Dash app (optional component) self.app = Dash( __name__, @@ -1125,6 +1129,90 @@ class AnnotationDashboard: } }) + # Live Training API Endpoints + @self.server.route('/api/live-training/start', methods=['POST']) + def start_live_training(): + """Start live inference and training mode""" + try: + if not self.orchestrator: + return jsonify({ + 'success': False, + 'error': 'Orchestrator not available' + }), 500 + + if self.orchestrator.start_live_training(): + return jsonify({ + 
'success': True, + 'status': 'started', + 'message': 'Live training mode started' + }) + else: + return jsonify({ + 'success': False, + 'error': 'Failed to start live training' + }), 500 + except Exception as e: + logger.error(f"Error starting live training: {e}") + return jsonify({ + 'success': False, + 'error': str(e) + }), 500 + + @self.server.route('/api/live-training/stop', methods=['POST']) + def stop_live_training(): + """Stop live inference and training mode""" + try: + if not self.orchestrator: + return jsonify({ + 'success': False, + 'error': 'Orchestrator not available' + }), 500 + + if self.orchestrator.stop_live_training(): + return jsonify({ + 'success': True, + 'status': 'stopped', + 'message': 'Live training mode stopped' + }) + else: + return jsonify({ + 'success': False, + 'error': 'Failed to stop live training' + }), 500 + except Exception as e: + logger.error(f"Error stopping live training: {e}") + return jsonify({ + 'success': False, + 'error': str(e) + }), 500 + + @self.server.route('/api/live-training/status', methods=['GET']) + def get_live_training_status(): + """Get live training status and statistics""" + try: + if not self.orchestrator: + return jsonify({ + 'success': False, + 'active': False, + 'error': 'Orchestrator not available' + }) + + is_active = self.orchestrator.is_live_training_active() + stats = self.orchestrator.get_live_training_stats() if is_active else {} + + return jsonify({ + 'success': True, + 'active': is_active, + 'stats': stats + }) + except Exception as e: + logger.error(f"Error getting live training status: {e}") + return jsonify({ + 'success': False, + 'active': False, + 'error': str(e) + }) + @self.server.route('/api/available-models', methods=['GET']) def get_available_models(): """Get list of available models with their load status""" diff --git a/CHECKPOINT_STRATEGY.md b/CHECKPOINT_STRATEGY.md new file mode 100644 index 0000000..c0baa64 --- /dev/null +++ b/CHECKPOINT_STRATEGY.md @@ -0,0 +1,404 @@ +# Checkpoint Strategy + +## Current System + +### ✅ What Exists + +The system has a sophisticated checkpoint management system in `utils/checkpoint_manager.py`: + +1. **Automatic Saving**: Checkpoints saved with metadata +2. **Performance Tracking**: Tracks metrics (loss, accuracy, reward) +3. **Best Checkpoint Selection**: Loads best performing checkpoint +4. **Automatic Cleanup**: Keeps only top N checkpoints +5. **Database Integration**: Metadata stored in database for fast access + +### How It Works + +```python +# Checkpoint Manager Configuration +max_checkpoints = 10 # Keep top 10 checkpoints +metric_name = "accuracy" # Rank by accuracy (or loss, reward) +checkpoint_dir = "models/checkpoints" +``` + +--- + +## Checkpoint Saving Logic + +### When Checkpoints Are Saved + +**Current Behavior**: Checkpoints are saved at **fixed intervals**, not based on performance improvement. 
+ +```python +# Example from DQN Agent +def save_checkpoint(self, episode_reward: float, force_save: bool = False): + """Save checkpoint if performance improved or forced""" + + # Save every N episodes (e.g., every 100 episodes) + if self.episode_count % 100 == 0 or force_save: + save_checkpoint( + model=self.policy_net, + model_name=self.model_name, + model_type="dqn", + performance_metrics={ + 'loss': self.current_loss, + 'reward': episode_reward, + 'accuracy': self.accuracy + } + ) +``` + +### Cleanup Logic + +After saving, the system automatically cleans up: + +```python +def _cleanup_checkpoints(self, model_name: str): + """ + Keep only the best N checkpoints + + Process: + 1. Load all checkpoint metadata + 2. Sort by metric (accuracy/loss/reward) + 3. Keep top N (default: 10) + 4. Delete the rest + """ + + # Sort by metric (highest first for accuracy, lowest for loss) + checkpoints.sort(key=lambda x: x['metrics'][metric_name], reverse=True) + + # Keep only top N + checkpoints_to_keep = checkpoints[:max_checkpoints] + checkpoints_to_delete = checkpoints[max_checkpoints:] + + # Delete old checkpoints + for checkpoint in checkpoints_to_delete: + os.remove(checkpoint_path) +``` + +--- + +## Recommended Strategy + +### Option 1: Save Every Batch, Keep Best (Current + Enhancement) + +**Pros**: +- Never miss a good checkpoint +- Automatic cleanup keeps disk usage low +- Simple to implement + +**Cons**: +- High I/O overhead (saving every batch) +- Slower training (disk writes) + +**Implementation**: +```python +def train_step(self, batch): + # Train + result = trainer.train_step(batch) + + # Save checkpoint after EVERY batch + save_checkpoint( + model=self.model, + model_name="transformer", + model_type="transformer", + performance_metrics={ + 'loss': result['total_loss'], + 'accuracy': result['accuracy'] + } + ) + # Cleanup automatically keeps only best 10 +``` + +**Disk Usage**: ~10 checkpoints × 200MB = 2GB (manageable) + +--- + +### Option 2: Save Only If Better (Recommended) + +**Pros**: +- Minimal I/O overhead +- Only saves improvements +- Faster training + +**Cons**: +- Need to track best performance +- Slightly more complex + +**Implementation**: +```python +class TrainingSession: + def __init__(self): + self.best_loss = float('inf') + self.best_accuracy = 0.0 + self.checkpoints_saved = 0 + +def train_step(self, batch): + # Train + result = trainer.train_step(batch) + + # Check if performance improved + current_loss = result['total_loss'] + current_accuracy = result['accuracy'] + + # Save if better (lower loss OR higher accuracy) + if current_loss < self.best_loss or current_accuracy > self.best_accuracy: + logger.info(f"Performance improved! 
Loss: {current_loss:.4f} (best: {self.best_loss:.4f}), " + f"Accuracy: {current_accuracy:.2%} (best: {self.best_accuracy:.2%})") + + save_checkpoint( + model=self.model, + model_name="transformer", + model_type="transformer", + performance_metrics={ + 'loss': current_loss, + 'accuracy': current_accuracy + } + ) + + # Update best metrics + self.best_loss = min(self.best_loss, current_loss) + self.best_accuracy = max(self.best_accuracy, current_accuracy) + self.checkpoints_saved += 1 +``` + +--- + +### Option 3: Hybrid Approach (Best of Both) + +**Strategy**: +- Save if performance improved (Option 2) +- Also save every N batches as backup (Option 1) +- Keep best 10 checkpoints + +**Implementation**: +```python +def train_step(self, batch, batch_num): + result = trainer.train_step(batch) + + current_loss = result['total_loss'] + current_accuracy = result['accuracy'] + + # Condition 1: Performance improved + performance_improved = ( + current_loss < self.best_loss or + current_accuracy > self.best_accuracy + ) + + # Condition 2: Regular interval (every 100 batches) + regular_interval = (batch_num % 100 == 0) + + # Save if either condition is met + if performance_improved or regular_interval: + reason = "improved" if performance_improved else "interval" + logger.info(f"Saving checkpoint ({reason}): loss={current_loss:.4f}, acc={current_accuracy:.2%}") + + save_checkpoint( + model=self.model, + model_name="transformer", + model_type="transformer", + performance_metrics={ + 'loss': current_loss, + 'accuracy': current_accuracy + }, + training_metadata={ + 'batch_num': batch_num, + 'reason': reason, + 'epoch': self.current_epoch + } + ) + + # Update best metrics + if performance_improved: + self.best_loss = min(self.best_loss, current_loss) + self.best_accuracy = max(self.best_accuracy, current_accuracy) +``` + +--- + +## Implementation for ANNOTATE Training + +### Current Code Location + +In `ANNOTATE/core/real_training_adapter.py`, the training loop is: + +```python +def _train_transformer_real(self, session, training_data): + # ... setup ... 
+ + for epoch in range(session.total_epochs): + for i, batch in enumerate(converted_batches): + result = trainer.train_step(batch) + + # ← ADD CHECKPOINT LOGIC HERE +``` + +### Recommended Addition + +```python +def _train_transformer_real(self, session, training_data): + # Initialize best metrics + best_loss = float('inf') + best_accuracy = 0.0 + checkpoints_saved = 0 + + for epoch in range(session.total_epochs): + for i, batch in enumerate(converted_batches): + result = trainer.train_step(batch) + + if result is not None: + current_loss = result.get('total_loss', float('inf')) + current_accuracy = result.get('accuracy', 0.0) + + # Check if performance improved + performance_improved = ( + current_loss < best_loss or + current_accuracy > best_accuracy + ) + + # Save every 100 batches OR if improved + should_save = performance_improved or (i % 100 == 0 and i > 0) + + if should_save: + try: + # Save checkpoint + from utils.checkpoint_manager import save_checkpoint + + checkpoint_metadata = save_checkpoint( + model=self.orchestrator.primary_transformer, + model_name="transformer", + model_type="transformer", + performance_metrics={ + 'loss': current_loss, + 'accuracy': current_accuracy, + 'action_loss': result.get('action_loss', 0.0), + 'price_loss': result.get('price_loss', 0.0) + }, + training_metadata={ + 'epoch': epoch + 1, + 'batch': i + 1, + 'total_batches': len(converted_batches), + 'training_session': session.training_id, + 'reason': 'improved' if performance_improved else 'interval' + } + ) + + if checkpoint_metadata: + checkpoints_saved += 1 + reason = "improved" if performance_improved else "interval" + logger.info(f" Checkpoint saved ({reason}): {checkpoint_metadata.checkpoint_id}") + logger.info(f" Loss: {current_loss:.4f}, Accuracy: {current_accuracy:.2%}") + + # Update best metrics + if performance_improved: + best_loss = min(best_loss, current_loss) + best_accuracy = max(best_accuracy, current_accuracy) + logger.info(f" New best! 
Loss: {best_loss:.4f}, Accuracy: {best_accuracy:.2%}") + + except Exception as e: + logger.error(f" Error saving checkpoint: {e}") + + logger.info(f" Training complete: {checkpoints_saved} checkpoints saved") + logger.info(f" Best loss: {best_loss:.4f}, Best accuracy: {best_accuracy:.2%}") +``` + +--- + +## Configuration + +### Checkpoint Settings + +```python +# In orchestrator initialization +checkpoint_manager = get_checkpoint_manager( + checkpoint_dir="models/checkpoints", + max_checkpoints=10, # Keep top 10 checkpoints + metric_name="accuracy" # Rank by accuracy (or "loss") +) +``` + +### Tuning Parameters + +| Parameter | Conservative | Balanced | Aggressive | +|-----------|-------------|----------|------------| +| `max_checkpoints` | 20 | 10 | 5 | +| `save_interval` | 50 batches | 100 batches | 200 batches | +| `improvement_threshold` | 0.1% | 0.5% | 1.0% | + +**Conservative**: Save more often, keep more checkpoints (safer, more disk) +**Balanced**: Default settings (recommended) +**Aggressive**: Save less often, keep fewer checkpoints (faster, less disk) + +--- + +## Disk Usage + +### Per Checkpoint + +| Model | Size | Notes | +|-------|------|-------| +| Transformer (46M params) | ~200MB | Full model + optimizer state | +| CNN | ~50MB | Smaller model | +| DQN | ~100MB | Medium model | + +### Total Storage + +``` +10 checkpoints × 200MB = 2GB per model +3 models × 2GB = 6GB total + +With metadata and backups: ~8GB +``` + +**Recommendation**: Keep 10 checkpoints (2GB per model is manageable) + +--- + +## Monitoring + +### Checkpoint Logs + +``` +INFO - Checkpoint saved (improved): transformer_20251031_142530 +INFO - Loss: 0.234, Accuracy: 78.5% +INFO - New best! Loss: 0.234, Accuracy: 78.5% + +INFO - Checkpoint saved (interval): transformer_20251031_142630 +INFO - Loss: 0.245, Accuracy: 77.2% + +INFO - Deleted 1 old checkpoints for transformer +``` + +### Dashboard Metrics + +``` +Checkpoints Saved: 15 +Best Loss: 0.234 +Best Accuracy: 78.5% +Disk Usage: 1.8GB / 2.0GB +Last Checkpoint: 2 minutes ago +``` + +--- + +## Summary + +### Current System +- ✅ Automatic checkpoint management +- ✅ Keeps best N checkpoints +- ✅ Database-backed metadata +- ❌ Saves at fixed intervals (not performance-based) + +### Recommended Enhancement +- ✅ Save when performance improves +- ✅ Also save every N batches as backup +- ✅ Keep best 10 checkpoints +- ✅ Minimal I/O overhead +- ✅ Never miss a good checkpoint + +### Implementation +Add checkpoint logic to `_train_transformer_real()` in `real_training_adapter.py` to save when: +1. Loss decreases OR accuracy increases (performance improved) +2. Every 100 batches (regular backup) + +The cleanup system automatically keeps only the best 10 checkpoints! 
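The tuning table above lists an `improvement_threshold` parameter, but the code sketches in this document save on *any* improvement, however small. Below is a minimal, hypothetical sketch of how a relative threshold could gate checkpoint saves; the `ImprovementGate` helper and its parameters are illustrative assumptions, not part of the existing `utils/checkpoint_manager.py` API.

```python
# Minimal sketch (assumption): gate checkpoint saves behind a relative
# improvement threshold. ImprovementGate is illustrative only; the real
# checkpoint manager does not expose this helper.

class ImprovementGate:
    def __init__(self, improvement_threshold: float = 0.005):
        # 0.005 = require at least 0.5% relative improvement before saving
        self.improvement_threshold = improvement_threshold
        self.best_loss = float('inf')
        self.best_accuracy = 0.0

    def should_save(self, loss: float, accuracy: float) -> bool:
        loss_improved = loss < self.best_loss * (1.0 - self.improvement_threshold)
        acc_improved = accuracy > self.best_accuracy * (1.0 + self.improvement_threshold)
        if loss_improved or acc_improved:
            self.best_loss = min(self.best_loss, loss)
            self.best_accuracy = max(self.best_accuracy, accuracy)
            return True
        return False


# Usage inside the training loop (sketch):
# gate = ImprovementGate(improvement_threshold=0.005)
# if gate.should_save(current_loss, current_accuracy) or (i % 100 == 0 and i > 0):
#     save_checkpoint(...)
```

Requiring a relative improvement avoids writing a flood of near-identical checkpoints when loss only oscillates by tiny amounts from batch to batch, while the periodic interval save still provides a regular backup.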
diff --git a/LIVE_INFERENCE_TRAINING_GUIDE.md b/LIVE_INFERENCE_TRAINING_GUIDE.md new file mode 100644 index 0000000..834cf93 --- /dev/null +++ b/LIVE_INFERENCE_TRAINING_GUIDE.md @@ -0,0 +1,480 @@ +# Live Inference & Training Mode Guide + +## Overview + +The system has an `EnhancedRealtimeTrainingSystem` that can perform: +- **Live Inference**: Predict next candle every second +- **Retrospective Training**: Train on previous candle once result is known +- **Multi-Timeframe**: Process 1s, 1m, 1h, 1d candles independently + +## Current Status + +### ✅ Available +- `EnhancedRealtimeTrainingSystem` class exists in `NN/training/enhanced_realtime_training.py` +- Comprehensive feature engineering +- Multi-model support (DQN, CNN, COB RL) +- Prediction tracking database +- Experience replay buffers + +### ❌ Not Enabled +- Not instantiated in orchestrator +- No integration with main trading loop +- No UI controls to start/stop + +--- + +## Architecture + +### Live Inference Flow + +``` +Every 1 second: +┌─────────────────────────────────────────┐ +│ 1. Fetch Latest Data │ +│ - 1s candle (just closed) │ +│ - 1m candle (if minute boundary) │ +│ - 1h candle (if hour boundary) │ +│ - 1d candle (if day boundary) │ +└──────────────┬──────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ 2. Make Predictions │ +│ - Next 1s candle OHLCV │ +│ - Next 1m candle OHLCV (if needed) │ +│ - Trading action (BUY/SELL/HOLD) │ +│ - Confidence score │ +└──────────────┬──────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ 3. Store Predictions │ +│ - Save to prediction_database │ +│ - Track prediction_id │ +│ - Wait for resolution │ +└─────────────────────────────────────────┘ +``` + +### Retrospective Training Flow + +``` +Every 1 second (after candle closes): +┌─────────────────────────────────────────┐ +│ 1. Get Previous Candle Result │ +│ - Actual OHLCV values │ +│ - Price change │ +│ - Volume │ +└──────────────┬──────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ 2. Resolve Predictions │ +│ - Compare predicted vs actual │ +│ - Calculate reward/loss │ +│ - Update prediction accuracy │ +└──────────────┬──────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ 3. Create Training Experience │ +│ - State: market data before candle │ +│ - Action: predicted action │ +│ - Reward: based on accuracy │ +│ - Next State: market data after │ +└──────────────┬──────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ 4. 
Train Models (if enough samples) │ +│ - Batch training (32-64 samples) │ +│ - Update model weights │ +│ - Save checkpoint │ +└─────────────────────────────────────────┘ +``` + +--- + +## Implementation Plan + +### Phase 1: Enable Realtime Training System + +#### 1.1 Initialize in Orchestrator + +```python +# In core/orchestrator.py __init__() + +if ENHANCED_TRAINING_AVAILABLE: + self.enhanced_training_system = EnhancedRealtimeTrainingSystem( + orchestrator=self, + data_provider=self.data_provider, + dashboard=None # Optional dashboard integration + ) + logger.info("EnhancedRealtimeTrainingSystem initialized") +else: + self.enhanced_training_system = None + logger.warning("EnhancedRealtimeTrainingSystem not available") +``` + +#### 1.2 Add Start/Stop Methods + +```python +# In core/orchestrator.py + +def start_live_training(self): + """Start live inference and training mode""" + if self.enhanced_training_system: + self.enhanced_training_system.start_training() + logger.info("Live training mode started") + return True + else: + logger.error("Enhanced training system not available") + return False + +def stop_live_training(self): + """Stop live inference and training mode""" + if self.enhanced_training_system: + self.enhanced_training_system.stop_training() + logger.info("Live training mode stopped") + return True + return False + +def is_live_training_active(self) -> bool: + """Check if live training is active""" + if self.enhanced_training_system: + return self.enhanced_training_system.is_training + return False +``` + +### Phase 2: Implement Prediction & Training Loop + +#### 2.1 Main Loop (runs every 1 second) + +```python +# In EnhancedRealtimeTrainingSystem + +def _live_inference_loop(self): + """Main loop for live inference and training""" + while self.is_training: + try: + current_time = time.time() + + # 1. Check which timeframes need processing + timeframes_to_process = self._get_active_timeframes(current_time) + + for timeframe in timeframes_to_process: + # 2. Make prediction for next candle + prediction = self._make_next_candle_prediction(timeframe) + + # 3. Store prediction + if prediction: + self._store_prediction(prediction) + + # 4. Resolve previous predictions + self._resolve_timeframe_predictions(timeframe) + + # 5. 
Train on resolved predictions + if self._should_train(timeframe): + self._train_on_timeframe(timeframe) + + # Sleep until next second + elapsed = time.time() - current_time + sleep_time = max(0, 1.0 - elapsed) + time.sleep(sleep_time) + + except Exception as e: + logger.error(f"Error in live inference loop: {e}") + time.sleep(1) +``` + +#### 2.2 Prediction Method + +```python +def _make_next_candle_prediction(self, timeframe: str) -> Dict: + """ + Predict next candle OHLCV values + + Returns: + { + 'timeframe': '1s', + 'timestamp': datetime, + 'predicted_open': float, + 'predicted_high': float, + 'predicted_low': float, + 'predicted_close': float, + 'predicted_volume': float, + 'action': 'BUY'|'SELL'|'HOLD', + 'confidence': float + } + """ + # Get current market state (600 candles) + market_state = self._get_market_state(timeframe) + + # Get model prediction + if self.orchestrator.primary_transformer: + output = self.orchestrator.primary_transformer(market_state) + + # Extract next candle prediction + next_candle = output['next_candles'][timeframe] + action_probs = output['action_probs'] + + return { + 'timeframe': timeframe, + 'timestamp': datetime.now(), + 'predicted_open': next_candle[0].item(), + 'predicted_high': next_candle[1].item(), + 'predicted_low': next_candle[2].item(), + 'predicted_close': next_candle[3].item(), + 'predicted_volume': next_candle[4].item(), + 'action': ['HOLD', 'BUY', 'SELL'][torch.argmax(action_probs).item()], + 'confidence': torch.max(action_probs).item() + } + + return None +``` + +#### 2.3 Training Method + +```python +def _train_on_timeframe(self, timeframe: str): + """ + Train model on resolved predictions for this timeframe + + Process: + 1. Get resolved predictions (predicted vs actual) + 2. Create training batches + 3. Calculate loss + 4. Update model weights + 5. 
Save checkpoint (if needed) + """ + # Get resolved predictions + resolved = self._get_resolved_predictions(timeframe, limit=100) + + if len(resolved) < 32: # Need minimum batch size + return + + # Create training batches + batches = self._create_training_batches(resolved) + + # Train model + if self.orchestrator.primary_transformer_trainer: + trainer = self.orchestrator.primary_transformer_trainer + + for batch in batches: + result = trainer.train_step(batch) + + # Log progress + if result: + logger.debug(f"Trained on {timeframe}: loss={result['total_loss']:.4f}") + + # Save checkpoint every N batches + if self.training_iteration % 100 == 0: + self._save_checkpoint(timeframe) +``` + +--- + +## Configuration + +### Training Intervals + +```python +training_config = { + # Inference intervals (how often to predict) + 'inference_1s': 1, # Every 1 second + 'inference_1m': 60, # Every 1 minute + 'inference_1h': 3600, # Every 1 hour + 'inference_1d': 86400, # Every 1 day + + # Training intervals (how often to train) + 'training_1s': 10, # Train every 10 seconds (10 samples) + 'training_1m': 300, # Train every 5 minutes (5 samples) + 'training_1h': 3600, # Train every 1 hour (1 sample) + 'training_1d': 86400, # Train every 1 day (1 sample) + + # Batch sizes + 'batch_size_1s': 32, + 'batch_size_1m': 16, + 'batch_size_1h': 8, + 'batch_size_1d': 4, + + # Buffer sizes + 'buffer_size_1s': 1000, + 'buffer_size_1m': 500, + 'buffer_size_1h': 200, + 'buffer_size_1d': 100 +} +``` + +### Performance Targets + +| Timeframe | Predictions/Hour | Training/Hour | GPU Load | Memory | +|-----------|------------------|---------------|----------|--------| +| 1s | 3,600 | 360 (every 10s) | 30-50% | 2GB | +| 1m | 60 | 12 (every 5m) | 10-20% | 1GB | +| 1h | 1 | 1 (every 1h) | 5-10% | 500MB | +| 1d | 0.04 | 0.04 (every 1d) | <5% | 200MB | + +--- + +## Database Schema + +### Predictions Table + +```sql +CREATE TABLE predictions ( + prediction_id INTEGER PRIMARY KEY, + model_name VARCHAR, + symbol VARCHAR, + timeframe VARCHAR, + timestamp BIGINT, + + -- Predicted values + predicted_open DOUBLE, + predicted_high DOUBLE, + predicted_low DOUBLE, + predicted_close DOUBLE, + predicted_volume DOUBLE, + predicted_action VARCHAR, + confidence DOUBLE, + + -- Actual values (filled when resolved) + actual_open DOUBLE, + actual_high DOUBLE, + actual_low DOUBLE, + actual_close DOUBLE, + actual_volume DOUBLE, + + -- Accuracy metrics + price_error DOUBLE, + volume_error DOUBLE, + action_correct BOOLEAN, + reward DOUBLE, + + -- Status + status VARCHAR, -- 'pending', 'resolved', 'expired' + resolved_at BIGINT +); +``` + +--- + +## UI Integration + +### Dashboard Controls + +```html + +
| Time | TF | Predicted | Actual | Error | Action |
|------|----|-----------|--------|-------|--------|