#!/usr/bin/env python3 """ Test Training Data Collection System This script demonstrates and tests the comprehensive training data collection system with data validation, rapid change detection, and profitable setup replay. """ import asyncio import logging import numpy as np import pandas as pd import time from datetime import datetime, timedelta from pathlib import Path # Setup logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Import our training system components from core.training_data_collector import ( TrainingDataCollector, RapidChangeDetector, ModelInputPackage, TrainingOutcome, TrainingEpisode ) from core.cnn_training_pipeline import ( CNNPivotPredictor, CNNTrainer ) from core.data_provider import DataProvider def create_sample_ohlcv_data() -> Dict[str, pd.DataFrame]: """Create sample OHLCV data for testing""" timeframes = ['1s', '1m', '5m', '15m', '1h'] ohlcv_data = {} for timeframe in timeframes: # Create sample data dates = pd.date_range(start='2024-01-01', periods=300, freq='1min') # Generate realistic price data base_price = 3000.0 # ETH price price_data = [] current_price = base_price for i in range(300): # Add some randomness change = np.random.normal(0, 0.002) # 0.2% std dev current_price *= (1 + change) # OHLCV for this period open_price = current_price high_price = current_price * (1 + abs(np.random.normal(0, 0.001))) low_price = current_price * (1 - abs(np.random.normal(0, 0.001))) close_price = current_price * (1 + np.random.normal(0, 0.0005)) volume = np.random.uniform(100, 1000) price_data.append({ 'timestamp': dates[i], 'open': open_price, 'high': high_price, 'low': low_price, 'close': close_price, 'volume': volume }) current_price = close_price df = pd.DataFrame(price_data) df.set_index('timestamp', inplace=True) ohlcv_data[timeframe] = df return ohlcv_data def create_sample_tick_data() -> List[Dict[str, Any]]: """Create sample tick data for testing""" tick_data = [] base_price = 3000.0 for i in range(100): tick_data.append({ 'timestamp': datetime.now() - timedelta(seconds=100-i), 'price': base_price + np.random.normal(0, 5), 'volume': np.random.uniform(0.1, 10.0), 'side': 'buy' if np.random.random() > 0.5 else 'sell', 'trade_id': f'trade_{i}', 'quantity': np.random.uniform(0.1, 5.0) }) return tick_data def create_sample_cob_data() -> Dict[str, Any]: """Create sample COB data for testing""" return { 'timestamp': datetime.now(), 'bid_levels': [3000 - i for i in range(10)], 'ask_levels': [3000 + i for i in range(10)], 'bid_volumes': [np.random.uniform(1, 10) for _ in range(10)], 'ask_volumes': [np.random.uniform(1, 10) for _ in range(10)], 'spread': 1.0, 'depth': 100.0 } def test_rapid_change_detector(): """Test the rapid change detection system""" logger.info("=== Testing Rapid Change Detector ===") detector = RapidChangeDetector( velocity_threshold=0.5, volatility_multiplier=3.0, lookback_minutes=5 ) symbol = 'ETHUSDT' base_price = 3000.0 # Add normal price points for i in range(120): # 2 minutes of data timestamp = datetime.now() - timedelta(seconds=120-i) price = base_price + np.random.normal(0, 1) # Small changes detector.add_price_point(symbol, timestamp, price) # Check for rapid change (should be False) is_rapid, velocity, volatility_spike = detector.detect_rapid_change(symbol) logger.info(f"Normal conditions - Rapid change: {is_rapid}, Velocity: {velocity:.3f}") # Add rapid price change for i in range(60): # 1 minute of rapid changes timestamp = datetime.now() - timedelta(seconds=60-i) price = base_price + 50 + i * 0.5 # Rapid increase detector.add_price_point(symbol, timestamp, price) # Check for rapid change (should be True) is_rapid, velocity, volatility_spike = detector.detect_rapid_change(symbol) logger.info(f"Rapid change conditions - Rapid change: {is_rapid}, Velocity: {velocity:.3f}") return detector def test_training_data_collector(): """Test the training data collection system""" logger.info("=== Testing Training Data Collector ===") # Initialize collector collector = TrainingDataCollector( storage_dir="test_training_data", max_episodes_per_symbol=100 ) collector.start_collection() symbol = 'ETHUSDT' # Create sample data ohlcv_data = create_sample_ohlcv_data() tick_data = create_sample_tick_data() cob_data = create_sample_cob_data() technical_indicators = { 'rsi_14': 65.5, 'macd': 0.5, 'sma_20': 3000.0, 'ema_12': 3005.0, 'bollinger_upper': 3050.0, 'bollinger_lower': 2950.0 } pivot_points = [ {'timestamp': datetime.now(), 'price': 3020.0, 'type': 'high'}, {'timestamp': datetime.now() - timedelta(minutes=30), 'price': 2980.0, 'type': 'low'} ] # Create CNN and RL features cnn_features = np.random.randn(2000).astype(np.float32) rl_state = np.random.randn(2000).astype(np.float32) orchestrator_context = { 'market_session': 'european', 'volatility_regime': 'medium', 'trend_direction': 'uptrend' } # Collect training data episode_id = collector.collect_training_data( symbol=symbol, ohlcv_data=ohlcv_data, tick_data=tick_data, cob_data=cob_data, technical_indicators=technical_indicators, pivot_points=pivot_points, cnn_features=cnn_features, rl_state=rl_state, orchestrator_context=orchestrator_context ) logger.info(f"Created training episode: {episode_id}") # Test data validation validation_results = collector.validate_data_integrity() logger.info(f"Data integrity validation: {validation_results}") # Get statistics stats = collector.get_collection_statistics() logger.info(f"Collection statistics: {stats}") collector.stop_collection() return collector def test_cnn_training_pipeline(): """Test the CNN training pipeline""" logger.info("=== Testing CNN Training Pipeline ===") # Initialize CNN model and trainer model = CNNPivotPredictor( input_channels=10, sequence_length=300, hidden_dim=128, # Smaller for testing num_pivot_classes=3 ) trainer = CNNTrainer( model=model, device='cpu', # Use CPU for testing learning_rate=0.001, storage_dir="test_cnn_training" ) # Create sample training episodes episodes = [] for i in range(50): # Create 50 sample episodes # Create sample input package input_package = ModelInputPackage( timestamp=datetime.now() - timedelta(minutes=i), symbol='ETHUSDT', ohlcv_data=create_sample_ohlcv_data(), tick_data=create_sample_tick_data(), cob_data=create_sample_cob_data(), technical_indicators={'rsi': 50.0, 'macd': 0.0}, pivot_points=[], cnn_features=np.random.randn(2000).astype(np.float32), rl_state=np.random.randn(2000).astype(np.float32), orchestrator_context={} ) # Create sample outcome outcome = TrainingOutcome( input_package_hash=input_package.data_hash, timestamp=input_package.timestamp, symbol='ETHUSDT', price_change_1m=np.random.normal(0, 0.01), price_change_5m=np.random.normal(0, 0.02), price_change_15m=np.random.normal(0, 0.03), price_change_1h=np.random.normal(0, 0.05), max_profit_potential=abs(np.random.normal(0, 0.02)), max_loss_potential=abs(np.random.normal(0, 0.015)), optimal_entry_price=3000.0, optimal_exit_price=3000.0 + np.random.normal(0, 10), optimal_holding_time=timedelta(minutes=np.random.randint(5, 60)), is_profitable=np.random.random() > 0.4, # 60% profitable profitability_score=np.random.uniform(0.3, 1.0), risk_reward_ratio=np.random.uniform(1.0, 3.0), is_rapid_change=np.random.random() > 0.8, # 20% rapid changes change_velocity=np.random.uniform(0.1, 2.0), volatility_spike=np.random.random() > 0.9, outcome_validated=True ) # Create training episode episode = TrainingEpisode( episode_id=f"test_episode_{i}", input_package=input_package, model_predictions={}, actual_outcome=outcome, episode_type='normal' ) episodes.append(episode) # Test training on episodes results = trainer._train_on_episodes(episodes, training_mode='test_batch') logger.info(f"Training results: {results}") # Test profitable episode training profitable_results = trainer.train_on_profitable_episodes( symbol='ETHUSDT', min_profitability=0.7, max_episodes=20 ) logger.info(f"Profitable training results: {profitable_results}") # Get training statistics stats = trainer.get_training_statistics() logger.info(f"Training statistics: {stats}") return trainer def test_integration(): """Test the complete integration""" logger.info("=== Testing Complete Integration ===") try: # Test individual components detector = test_rapid_change_detector() collector = test_training_data_collector() trainer = test_cnn_training_pipeline() logger.info("✅ All components tested successfully!") # Test data flow logger.info("Testing data flow integration...") # Simulate real-time data collection and training symbol = 'ETHUSDT' # Collect multiple data points for i in range(10): ohlcv_data = create_sample_ohlcv_data() tick_data = create_sample_tick_data() cob_data = create_sample_cob_data() episode_id = collector.collect_training_data( symbol=symbol, ohlcv_data=ohlcv_data, tick_data=tick_data, cob_data=cob_data, technical_indicators={'rsi': 50.0 + i}, pivot_points=[], cnn_features=np.random.randn(2000).astype(np.float32), rl_state=np.random.randn(2000).astype(np.float32), orchestrator_context={} ) logger.info(f"Collected episode {i+1}: {episode_id}") time.sleep(0.1) # Small delay # Get final statistics final_stats = collector.get_collection_statistics() logger.info(f"Final collection statistics: {final_stats}") logger.info("✅ Integration test completed successfully!") return True except Exception as e: logger.error(f"❌ Integration test failed: {e}") import traceback logger.error(traceback.format_exc()) return False def main(): """Main test function""" logger.info("=" * 80) logger.info("COMPREHENSIVE TRAINING DATA COLLECTION SYSTEM TEST") logger.info("=" * 80) start_time = time.time() try: # Run integration test success = test_integration() end_time = time.time() duration = end_time - start_time logger.info("=" * 80) if success: logger.info("✅ ALL TESTS PASSED!") else: logger.info("❌ SOME TESTS FAILED!") logger.info(f"Test duration: {duration:.2f} seconds") logger.info("=" * 80) # Display summary logger.info("\n📊 SYSTEM CAPABILITIES DEMONSTRATED:") logger.info("✓ Comprehensive training data collection with validation") logger.info("✓ Rapid price change detection for premium training examples") logger.info("✓ Data integrity validation and completeness checking") logger.info("✓ CNN training pipeline with backpropagation data storage") logger.info("✓ Profitable episode prioritization and replay") logger.info("✓ Training session value calculation and ranking") logger.info("✓ Real-time data integration capabilities") logger.info("\n🎯 NEXT STEPS:") logger.info("1. Integrate with existing DataProvider for real market data") logger.info("2. Connect with actual CNN and RL models") logger.info("3. Implement outcome validation with real price data") logger.info("4. Add dashboard integration for monitoring") logger.info("5. Scale up for production deployment") except Exception as e: logger.error(f"❌ Test execution failed: {e}") import traceback logger.error(traceback.format_exc()) if __name__ == "__main__": main()