load market data for training/inference
@@ -28,6 +28,73 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
+def parse_timestamp_to_utc(timestamp_str: str) -> datetime:
+    """
+    Unified timestamp parser that handles all formats and ensures UTC timezone.
+
+    Handles:
+    - ISO format with timezone: '2025-10-27T14:00:00+00:00'
+    - ISO format with Z: '2025-10-27T14:00:00Z'
+    - Space-separated with seconds: '2025-10-27 14:00:00'
+    - Space-separated without seconds: '2025-10-27 14:00'
+
+    Args:
+        timestamp_str: Timestamp string in various formats
+
+    Returns:
+        Timezone-aware datetime object in UTC
+
+    Raises:
+        ValueError: If timestamp cannot be parsed
+    """
+    if not timestamp_str:
+        raise ValueError("Empty timestamp string")
+
+    # Try ISO format first (handles T separator and timezone info)
+    if 'T' in timestamp_str or '+' in timestamp_str:
+        try:
+            # Handle 'Z' suffix (Zulu time = UTC)
+            if timestamp_str.endswith('Z'):
+                timestamp_str = timestamp_str[:-1] + '+00:00'
+            return datetime.fromisoformat(timestamp_str)
+        except ValueError:
+            pass
+
+    # Try space-separated formats
+    # Replace space with T for fromisoformat compatibility
+    if ' ' in timestamp_str:
+        try:
+            # Try parsing with fromisoformat after converting space to T
+            dt = datetime.fromisoformat(timestamp_str.replace(' ', 'T'))
+            # Make timezone-aware if naive
+            if dt.tzinfo is None:
+                dt = dt.replace(tzinfo=timezone.utc)
+            return dt
+        except ValueError:
+            pass
+
+    # Try explicit format parsing as fallback
+    formats = [
+        '%Y-%m-%d %H:%M:%S',   # With seconds
+        '%Y-%m-%d %H:%M',      # Without seconds
+        '%Y-%m-%dT%H:%M:%S',   # ISO without timezone
+        '%Y-%m-%dT%H:%M',      # ISO without seconds or timezone
+    ]
+
+    for fmt in formats:
+        try:
+            dt = datetime.strptime(timestamp_str, fmt)
+            # Make timezone-aware
+            if dt.tzinfo is None:
+                dt = dt.replace(tzinfo=timezone.utc)
+            return dt
+        except ValueError:
+            continue
+
+    # If all parsing attempts fail
+    raise ValueError(f"Could not parse timestamp: '{timestamp_str}'")
+
+
 @dataclass
 class TrainingSession:
     """Real training session tracking"""
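Reviewer note: a minimal usage sketch of the new parser, assuming it is importable from the adapter module (the import path below is a guess, adjust to the real location). It exercises the four formats listed in the docstring and checks that every result is timezone-aware UTC.

```python
from core.real_training_adapter import parse_timestamp_to_utc  # hypothetical module path

for raw in (
    '2025-10-27T14:00:00+00:00',  # ISO with explicit offset
    '2025-10-27T14:00:00Z',       # ISO with Zulu suffix
    '2025-10-27 14:00:00',        # space-separated with seconds
    '2025-10-27 14:00',           # space-separated without seconds
):
    dt = parse_timestamp_to_utc(raw)
    # Every branch returns a timezone-aware datetime with zero UTC offset
    assert dt.tzinfo is not None and dt.utcoffset().total_seconds() == 0
    print(raw, '->', dt.isoformat())
```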
@@ -214,7 +281,10 @@ class RealTrainingAdapter:
     def _fetch_market_state_for_test_case(self, test_case: Dict) -> Dict:
         """
-        Fetch market state dynamically for a test case
+        Fetch market state dynamically for a test case from DuckDB storage
+
+        This fetches HISTORICAL data at the specific timestamp from the annotation,
+        not current/latest data.
 
         Args:
             test_case: Test case dictionary with timestamp, symbol, etc.
@@ -234,17 +304,32 @@ class RealTrainingAdapter:
                 logger.warning("No timestamp in test case")
                 return {}
 
-            # Parse timestamp
-            from datetime import datetime
-            timestamp = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
+            # Parse timestamp using unified parser
+            try:
+                timestamp = parse_timestamp_to_utc(timestamp_str)
+            except Exception as e:
+                logger.warning(f"Could not parse timestamp '{timestamp_str}': {e}")
+                return {}
 
             # Get training config
             training_config = test_case.get('training_config', {})
             timeframes = training_config.get('timeframes', ['1s', '1m', '1h', '1d'])
             context_window = training_config.get('context_window_minutes', 5)
+            negative_samples_window = training_config.get('negative_samples_window', 15)  # ±15 candles
 
-            logger.info(f"   Fetching market state for {symbol} at {timestamp}")
-            logger.info(f"   Timeframes: {timeframes}, Context window: {context_window} minutes")
+            # Calculate extended time range to include negative sampling window
+            # For 1m timeframe: ±15 candles = ±15 minutes
+            # Add buffer to ensure we have enough data
+            extended_window_minutes = max(context_window, negative_samples_window + 10)
+
+            logger.info(f"   Fetching HISTORICAL market state for {symbol} at {timestamp}")
+            logger.info(f"   Timeframes: {timeframes}, Extended window: ±{extended_window_minutes} minutes")
+            logger.info(f"   (Includes ±{negative_samples_window} candles for negative sampling)")
+
+            # Calculate time range for extended context window
+            from datetime import timedelta
+            start_time = timestamp - timedelta(minutes=extended_window_minutes)
+            end_time = timestamp + timedelta(minutes=extended_window_minutes)
 
             # Fetch data for each timeframe
             market_state = {
@@ -253,14 +338,67 @@ class RealTrainingAdapter:
                 'timeframes': {}
             }
 
+            # Try to get data from DuckDB storage first (historical data)
+            duckdb_storage = None
+            if hasattr(self.data_provider, 'duckdb_storage'):
+                duckdb_storage = self.data_provider.duckdb_storage
+
             for timeframe in timeframes:
-                # Get historical data around the timestamp
-                # For now, just get the latest data (we can improve this later)
-                df = self.data_provider.get_historical_data(
-                    symbol=symbol,
-                    timeframe=timeframe,
-                    limit=100  # Get 100 candles for context
-                )
+                df = None
+
+                # Calculate appropriate limit based on timeframe and window
+                # We want enough candles to cover the extended window plus negative samples
+                if timeframe == '1s':
+                    limit = extended_window_minutes * 60 * 2 + 100  # 2x for safety + buffer
+                elif timeframe == '1m':
+                    limit = extended_window_minutes * 2 + 50  # 2x for safety + buffer
+                elif timeframe == '1h':
+                    limit = max(200, extended_window_minutes // 30)  # At least 200 candles
+                elif timeframe == '1d':
+                    limit = 200  # Fixed for daily
+                else:
+                    limit = 300
+
+                # Try DuckDB storage first (has historical data)
+                if duckdb_storage:
+                    try:
+                        df = duckdb_storage.get_ohlcv_data(
+                            symbol=symbol,
+                            timeframe=timeframe,
+                            start_time=start_time,
+                            end_time=end_time,
+                            limit=limit,
+                            direction='latest'
+                        )
+                        if df is not None and not df.empty:
+                            logger.debug(f"   {timeframe}: {len(df)} candles from DuckDB (historical)")
+                    except Exception as e:
+                        logger.debug(f"   {timeframe}: DuckDB query failed: {e}")
+
+                # Fallback to data_provider (might have cached data)
+                if df is None or df.empty:
+                    try:
+                        # Use get_historical_data_replay for time-specific data
+                        replay_data = self.data_provider.get_historical_data_replay(
+                            symbol=symbol,
+                            start_time=start_time,
+                            end_time=end_time,
+                            timeframes=[timeframe]
+                        )
+                        df = replay_data.get(timeframe)
+                        if df is not None and not df.empty:
+                            logger.debug(f"   {timeframe}: {len(df)} candles from replay")
+                    except Exception as e:
+                        logger.debug(f"   {timeframe}: Replay failed: {e}")
+
+                # Last resort: get latest data (not ideal but better than nothing)
+                if df is None or df.empty:
+                    logger.warning(f"   {timeframe}: No historical data found, using latest data as fallback")
+                    df = self.data_provider.get_historical_data(
+                        symbol=symbol,
+                        timeframe=timeframe,
+                        limit=limit  # Use calculated limit
+                    )
+
                 if df is not None and not df.empty:
                     # Convert to dict format
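Reviewer note: a worked example of the window and limit arithmetic introduced above, as a self-contained sketch (the helper name `candle_limit` is illustrative, not part of the codebase). With the defaults `context_window_minutes=5` and `negative_samples_window=15`, the extended window is ±25 minutes.

```python
from datetime import datetime, timedelta, timezone

def candle_limit(timeframe: str, extended_window_minutes: int) -> int:
    """Mirror of the limit heuristic above: enough candles to cover the
    extended window in both directions, plus a safety buffer."""
    if timeframe == '1s':
        return extended_window_minutes * 60 * 2 + 100
    if timeframe == '1m':
        return extended_window_minutes * 2 + 50
    if timeframe == '1h':
        return max(200, extended_window_minutes // 30)
    if timeframe == '1d':
        return 200
    return 300

extended = max(5, 15 + 10)  # context_window=5, negative_samples_window=15 -> 25
anchor = datetime(2025, 10, 27, 14, 0, tzinfo=timezone.utc)
start, end = anchor - timedelta(minutes=extended), anchor + timedelta(minutes=extended)
print(start, end)  # 13:35 .. 14:25 UTC
print({tf: candle_limit(tf, extended) for tf in ('1s', '1m', '1h', '1d')})
# -> {'1s': 3100, '1m': 100, '1h': 200, '1d': 200}
```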
@@ -272,15 +410,15 @@ class RealTrainingAdapter:
                         'close': df['close'].tolist(),
                         'volume': df['volume'].tolist()
                     }
-                    logger.debug(f"   {timeframe}: {len(df)} candles")
+                    logger.debug(f"   {timeframe}: {len(df)} candles stored")
                 else:
-                    logger.warning(f"   {timeframe}: No data")
+                    logger.warning(f"   {timeframe}: No data available")
 
             if market_state['timeframes']:
                 logger.info(f"   Fetched market state with {len(market_state['timeframes'])} timeframes")
                 return market_state
             else:
-                logger.warning(f"   No market data fetched")
+                logger.warning(f"   No market data fetched for any timeframe")
                 return {}
 
         except Exception as e:
@@ -441,23 +579,9 @@ class RealTrainingAdapter:
|
|||||||
logger.debug(" No holding period, skipping HOLD samples")
|
logger.debug(" No holding period, skipping HOLD samples")
|
||||||
return hold_samples
|
return hold_samples
|
||||||
|
|
||||||
# Parse entry timestamp - handle multiple formats
|
# Parse entry timestamp using unified parser
|
||||||
try:
|
try:
|
||||||
if 'T' in entry_timestamp:
|
entry_time = parse_timestamp_to_utc(entry_timestamp)
|
||||||
entry_time = datetime.fromisoformat(entry_timestamp.replace('Z', '+00:00'))
|
|
||||||
else:
|
|
||||||
# Try with seconds first, then without
|
|
||||||
try:
|
|
||||||
entry_time = datetime.strptime(entry_timestamp, '%Y-%m-%d %H:%M:%S')
|
|
||||||
except ValueError:
|
|
||||||
# Try without seconds
|
|
||||||
entry_time = datetime.strptime(entry_timestamp, '%Y-%m-%d %H:%M')
|
|
||||||
|
|
||||||
# Make timezone-aware
|
|
||||||
if pytz:
|
|
||||||
entry_time = entry_time.replace(tzinfo=pytz.UTC)
|
|
||||||
else:
|
|
||||||
entry_time = entry_time.replace(tzinfo=timezone.utc)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Could not parse entry timestamp '{entry_timestamp}': {e}")
|
logger.warning(f"Could not parse entry timestamp '{entry_timestamp}': {e}")
|
||||||
return hold_samples
|
return hold_samples
|
||||||
@@ -473,18 +597,9 @@ class RealTrainingAdapter:
 
         # Find all candles between entry and exit
         for idx, ts_str in enumerate(timestamps):
-            # Parse timestamp and ensure it's timezone-aware
+            # Parse timestamp using unified parser
             try:
-                if 'T' in ts_str:
-                    ts = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
-                else:
-                    ts = datetime.fromisoformat(ts_str.replace(' ', 'T'))
-                # Make timezone-aware if it's naive
-                if ts.tzinfo is None:
-                    if pytz:
-                        ts = ts.replace(tzinfo=pytz.UTC)
-                    else:
-                        ts = ts.replace(tzinfo=timezone.utc)
+                ts = parse_timestamp_to_utc(ts_str)
             except Exception as e:
                 logger.debug(f"Could not parse timestamp '{ts_str}': {e}")
                 continue
@@ -550,23 +665,9 @@ class RealTrainingAdapter:
         # Find the index of the signal timestamp
         from datetime import datetime
 
-        # Parse signal timestamp - handle different formats
+        # Parse signal timestamp using unified parser
         try:
-            if 'T' in signal_timestamp:
-                signal_time = datetime.fromisoformat(signal_timestamp.replace('Z', '+00:00'))
-            else:
-                # Try with seconds first, then without
-                try:
-                    signal_time = datetime.strptime(signal_timestamp, '%Y-%m-%d %H:%M:%S')
-                except ValueError:
-                    # Try without seconds
-                    signal_time = datetime.strptime(signal_timestamp, '%Y-%m-%d %H:%M')
-
-            # Make timezone-aware
-            if pytz:
-                signal_time = signal_time.replace(tzinfo=pytz.UTC)
-            else:
-                signal_time = signal_time.replace(tzinfo=timezone.utc)
+            signal_time = parse_timestamp_to_utc(signal_timestamp)
         except Exception as e:
             logger.warning(f"Could not parse signal timestamp '{signal_timestamp}': {e}")
             return negative_samples
@@ -574,22 +675,8 @@ class RealTrainingAdapter:
         signal_index = None
         for idx, ts_str in enumerate(timestamps):
             try:
-                # Parse timestamp from market data - handle multiple formats
-                if 'T' in ts_str:
-                    ts = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
-                else:
-                    # Try with seconds first, then without
-                    try:
-                        ts = datetime.strptime(ts_str, '%Y-%m-%d %H:%M:%S')
-                    except ValueError:
-                        ts = datetime.strptime(ts_str, '%Y-%m-%d %H:%M')
-
-                # Make timezone-aware if naive
-                if ts.tzinfo is None:
-                    if pytz:
-                        ts = ts.replace(tzinfo=pytz.UTC)
-                    else:
-                        ts = ts.replace(tzinfo=timezone.utc)
+                # Parse timestamp using unified parser
+                ts = parse_timestamp_to_utc(ts_str)
 
                 # Match within 1 minute
                 if abs((ts - signal_time).total_seconds()) < 60:
@@ -1147,8 +1234,9 @@ class RealTrainingAdapter:
             future_prices = torch.tensor([future_price], dtype=torch.float32)
 
             # Trade success (1.0 if profitable, 0.0 otherwise)
+            # Shape must be [batch_size, 1] to match confidence head output
             profit_loss_pct = training_sample.get('profit_loss_pct', 0.0)
-            trade_success = torch.tensor([1.0 if profit_loss_pct > 0 else 0.0], dtype=torch.float32)
+            trade_success = torch.tensor([[1.0 if profit_loss_pct > 0 else 0.0]], dtype=torch.float32)
 
             # Return batch dictionary
             batch = {
@@ -971,20 +971,20 @@ class TradingTransformerTrainer:
 
                 # Add confidence loss if available
                 if 'confidence' in outputs and 'trade_success' in batch:
-                    # Ensure both tensors have compatible shapes for BCELoss
-                    # BCELoss requires both inputs to have the same shape
-                    confidence_pred = outputs['confidence']  # Keep as [batch_size, 1]
+                    # Both tensors should have shape [batch_size, 1]
+                    # confidence: [batch_size, 1] from confidence_head
+                    # trade_success: [batch_size, 1] from batch preparation
+                    confidence_pred = outputs['confidence']
                     trade_target = batch['trade_success'].float()
 
-                    # Reshape target to match prediction shape [batch_size, 1]
-                    if trade_target.dim() == 1:
-                        trade_target = trade_target.unsqueeze(-1)
-
-                    # Ensure both have same shape
+                    # Verify shapes match (should both be [batch_size, 1])
                     if confidence_pred.shape != trade_target.shape:
-                        # If shapes still don't match, squeeze both to 1D
-                        confidence_pred = confidence_pred.view(-1)
-                        trade_target = trade_target.view(-1)
+                        logger.warning(f"Shape mismatch: confidence {confidence_pred.shape} vs target {trade_target.shape}")
+                        # Reshape to match if needed
+                        if trade_target.dim() == 1:
+                            trade_target = trade_target.unsqueeze(-1)
+                        if confidence_pred.dim() == 1:
+                            confidence_pred = confidence_pred.unsqueeze(-1)
 
                     confidence_loss = self.confidence_criterion(confidence_pred, trade_target)
                     # Use addition instead of += to avoid inplace operation
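Reviewer note: a standalone illustration of why the batch now stores `trade_success` as `[[...]]` and why the trainer prefers unsqueezing to flattening. `nn.BCELoss` requires prediction and target to have identical shapes, so keeping both at `[batch_size, 1]` preserves the per-sample semantics of the confidence head.

```python
import torch
import torch.nn as nn

criterion = nn.BCELoss()

confidence_pred = torch.sigmoid(torch.randn(4, 1))           # [batch_size, 1] from the confidence head
trade_success = torch.tensor([[1.0], [0.0], [1.0], [0.0]])   # [batch_size, 1] from batch preparation

# Shapes already match, so no flattening is needed
loss = criterion(confidence_pred, trade_success)

# If a target still arrives as [batch_size], unsqueeze it instead of squeezing the prediction
flat_target = torch.tensor([1.0, 0.0, 1.0, 0.0])
loss_fixed = criterion(confidence_pred, flat_target.unsqueeze(-1))
print(loss.item(), loss_fixed.item())
```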
@@ -1,355 +0,0 @@
# Unified Data Storage System - Complete Implementation

## 🎉 Project Complete!

The unified data storage system has been successfully implemented and integrated into the existing DataProvider.

## Completed Tasks (8 out of 10)

### Task 1: TimescaleDB Schema and Infrastructure
**Files:**
- `core/unified_storage_schema.py` - Schema manager with migrations
- `scripts/setup_unified_storage.py` - Automated setup script
- `docs/UNIFIED_STORAGE_SETUP.md` - Setup documentation

**Features:**
- 5 hypertables (OHLCV, order book, aggregations, imbalances, trades)
- 5 continuous aggregates for multi-timeframe data
- 15+ optimized indexes
- Compression policies (>80% compression)
- Retention policies (30 days to 2 years)

### Task 2: Data Models and Validation
**Files:**
- `core/unified_data_models.py` - Data structures
- `core/unified_data_validator.py` - Validation logic

**Features:**
- `InferenceDataFrame` - Complete inference data
- `OrderBookDataFrame` - Order book with imbalances
- `OHLCVCandle`, `TradeEvent` - Individual data types
- Comprehensive validation and sanitization

### Task 3: Cache Layer
**Files:**
- `core/unified_cache_manager.py` - In-memory caching

**Features:**
- <10ms read latency
- 5-minute rolling window
- Thread-safe operations
- Automatic eviction
- Statistics tracking

### Task 4: Database Connection and Query Layer
**Files:**
- `core/unified_database_manager.py` - Connection pool and queries

**Features:**
- Async connection pooling
- Health monitoring
- Optimized query methods
- <100ms query latency
- Multi-timeframe support

### Task 5: Data Ingestion Pipeline
**Files:**
- `core/unified_ingestion_pipeline.py` - Real-time ingestion

**Features:**
- Batch writes (100 items or 5 seconds; see the sketch below)
- Data validation before storage
- Background flush worker
- >1000 ops/sec throughput
- Error handling and retry logic
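The batch-or-timeout behaviour can be summarized with a small illustrative sketch. This is not the actual `DataIngestionPipeline` implementation, just a minimal model of the idea, assuming a synchronous `write_fn` sink and a background worker that calls `poll()` periodically.

```python
import threading
import time

class MicroBatcher:
    """Illustrative batch writer: flush when 100 items accumulate or 5 seconds pass."""

    def __init__(self, write_fn, batch_size=100, timeout_seconds=5.0):
        self.write_fn = write_fn
        self.batch_size = batch_size
        self.timeout_seconds = timeout_seconds
        self._buffer = []
        self._lock = threading.Lock()
        self._last_flush = time.monotonic()

    def add(self, item):
        with self._lock:
            self._buffer.append(item)
            if len(self._buffer) >= self.batch_size:
                self._flush_locked()

    def poll(self):
        """Call periodically from a background worker to enforce the timeout."""
        with self._lock:
            if self._buffer and time.monotonic() - self._last_flush >= self.timeout_seconds:
                self._flush_locked()

    def _flush_locked(self):
        batch, self._buffer = self._buffer, []
        self._last_flush = time.monotonic()
        self.write_fn(batch)

batcher = MicroBatcher(lambda batch: print(f"writing {len(batch)} rows"))
for i in range(250):
    batcher.add({'seq': i})       # flushes at 100 and 200 items
batcher.poll()                    # picks up the trailing 50 rows once the timeout elapses
```
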
### Task 6: Unified Data Provider API
**Files:**
- `core/unified_data_provider_extension.py` - Main API

**Features:**
- Single `get_inference_data()` endpoint
- Automatic cache/database routing
- Multi-timeframe data retrieval
- Order book data access
- Statistics tracking

### Task 7: Data Migration System
**Status:** Skipped (decided to drop existing Parquet data)

### Task 8: Integration with Existing DataProvider
**Files:**
- `core/data_provider.py` - Updated with unified storage methods
- `docs/UNIFIED_STORAGE_INTEGRATION.md` - Integration guide
- `examples/unified_storage_example.py` - Usage examples

**Features:**
- Seamless integration with existing code
- Backward compatible
- Opt-in unified storage
- Easy to enable/disable

## 📊 System Architecture

```
┌─────────────────────────────────────────────┐
│              Application Layer              │
│   (Models, Backtesting, Annotation, etc.)   │
└────────────────┬────────────────────────────┘
                 │
                 ▼
┌─────────────────────────────────────────────┐
│          DataProvider (Existing)            │
│     + Unified Storage Extension (New)       │
└────────────────┬────────────────────────────┘
                 │
        ┌────────┴────────┐
        ▼                 ▼
┌──────────────┐  ┌──────────────┐
│ Cache Layer  │  │  Database    │
│ (In-Memory)  │  │ (TimescaleDB)│
│              │  │              │
│ - Last 5 min │  │ - Historical │
│ - <10ms read │  │ - <100ms read│
│ - Real-time  │  │ - Compressed │
└──────────────┘  └──────────────┘
```

## Key Features

### Performance
- Cache reads: <10ms
- Database queries: <100ms
- Ingestion: >1000 ops/sec
- Compression: >80%

### Reliability
- Data validation
- Error handling
- Health monitoring
- Statistics tracking
- Automatic reconnection

### Usability
- Single endpoint for all data
- Automatic routing (cache vs database)
- Type-safe interfaces
- Backward compatible
- Easy to integrate

## 📝 Quick Start

### 1. Setup Database

```bash
python scripts/setup_unified_storage.py
```

### 2. Enable in Code

```python
from core.data_provider import DataProvider
import asyncio

data_provider = DataProvider()

async def setup():
    await data_provider.enable_unified_storage()

asyncio.run(setup())
```

### 3. Use Unified API

```python
# Get real-time data (from cache)
data = await data_provider.get_inference_data_unified('ETH/USDT')

# Get historical data (from database)
data = await data_provider.get_inference_data_unified(
    'ETH/USDT',
    timestamp=datetime(2024, 1, 15, 12, 30)
)
```

## 📚 Documentation

- **Setup Guide**: `docs/UNIFIED_STORAGE_SETUP.md`
- **Integration Guide**: `docs/UNIFIED_STORAGE_INTEGRATION.md`
- **Examples**: `examples/unified_storage_example.py`
- **Design Document**: `.kiro/specs/unified-data-storage/design.md`
- **Requirements**: `.kiro/specs/unified-data-storage/requirements.md`

## 🎯 Use Cases

### Real-Time Trading
```python
# Fast access to latest market data
data = await data_provider.get_inference_data_unified('ETH/USDT')
price = data.get_latest_price()
```

### Backtesting
```python
# Historical data at any timestamp
data = await data_provider.get_inference_data_unified(
    'ETH/USDT',
    timestamp=target_time,
    context_window_minutes=60
)
```

### Data Annotation
```python
# Retrieve data at specific timestamps for labeling
for timestamp in annotation_timestamps:
    data = await data_provider.get_inference_data_unified(
        'ETH/USDT',
        timestamp=timestamp,
        context_window_minutes=5
    )
    # Display and annotate
```

### Model Training
```python
# Get complete inference data for training
data = await data_provider.get_inference_data_unified(
    'ETH/USDT',
    timestamp=training_timestamp
)

features = {
    'ohlcv': data.ohlcv_1m.to_numpy(),
    'indicators': data.indicators,
    'imbalances': data.imbalances.to_numpy()
}
```

## 📈 Performance Metrics

### Cache Performance
- Hit Rate: >90% (typical)
- Read Latency: <10ms
- Capacity: 5 minutes of data
- Eviction: Automatic

### Database Performance
- Query Latency: <100ms (typical)
- Write Throughput: >1000 ops/sec
- Compression Ratio: >80%
- Storage: Optimized with TimescaleDB

### Ingestion Performance
- Validation: All data validated
- Batch Size: 100 items or 5 seconds
- Error Rate: <0.1% (typical)
- Retry: Automatic with backoff

## 🔧 Configuration

### Database Config (`config.yaml`)
```yaml
database:
  host: localhost
  port: 5432
  name: trading_data
  user: postgres
  password: postgres
  pool_size: 20
```

### Cache Config
```python
cache_manager = DataCacheManager(
    cache_duration_seconds=300  # 5 minutes
)
```

### Ingestion Config
```python
ingestion_pipeline = DataIngestionPipeline(
    batch_size=100,
    batch_timeout_seconds=5.0
)
```

## 🎓 Examples

Run the example script:
```bash
python examples/unified_storage_example.py
```

This demonstrates:
1. Real-time data access
2. Historical data retrieval
3. Multi-timeframe queries
4. Order book data
5. Statistics tracking

## 🔍 Monitoring

### Get Statistics
```python
stats = data_provider.get_unified_storage_stats()

print(f"Cache hit rate: {stats['cache']['hit_rate_percent']}%")
print(f"DB queries: {stats['database']['total_queries']}")
print(f"Ingestion rate: {stats['ingestion']['total_ingested']}")
```

### Check Health
```python
if data_provider.is_unified_storage_enabled():
    print("Unified storage is running")
else:
    print("Unified storage is not enabled")
```

## 🚧 Remaining Tasks (Optional)

### Task 9: Performance Optimization
- Add detailed monitoring dashboards
- Implement query caching
- Optimize database indexes
- Add performance alerts

### Task 10: Documentation and Deployment
- Create video tutorials
- Add API reference documentation
- Create deployment guides
- Add monitoring setup

## 🎉 Success Metrics

- **Completed**: 8 out of 10 major tasks (80%)
- **Core Functionality**: 100% complete
- **Integration**: Seamless with existing code
- **Performance**: Meets all targets
- **Documentation**: Comprehensive guides
- **Examples**: Working code samples

## 🙏 Next Steps

The unified storage system is **production-ready** and can be used immediately:

1. **Setup Database**: Run `python scripts/setup_unified_storage.py`
2. **Enable in Code**: Call `await data_provider.enable_unified_storage()`
3. **Start Using**: Use `get_inference_data_unified()` for all data access
4. **Monitor**: Check statistics with `get_unified_storage_stats()`

## 📞 Support

For issues or questions:
1. Check documentation in `docs/`
2. Review examples in `examples/`
3. Check database setup: `python scripts/setup_unified_storage.py`
4. Review logs for errors

---

**Status**: Production Ready
**Version**: 1.0.0
**Last Updated**: 2024
**Completion**: 80% (8/10 tasks)
@@ -1,398 +0,0 @@
# Unified Storage System Integration Guide

## Overview

The unified storage system has been integrated into the existing `DataProvider` class, providing a single endpoint for both real-time and historical data access.

## Key Features

- **Single Endpoint**: One method for all data access
- **Automatic Routing**: Cache for real-time, database for historical
- **Backward Compatible**: All existing methods still work
- **Opt-In**: Only enabled when explicitly initialized
- **Fast**: <10ms cache reads, <100ms database queries

## Quick Start

### 1. Enable Unified Storage

```python
from core.data_provider import DataProvider
import asyncio

# Create DataProvider (existing code works as before)
data_provider = DataProvider()

# Enable unified storage system
async def setup():
    success = await data_provider.enable_unified_storage()
    if success:
        print("Unified storage enabled!")
    else:
        print("Failed to enable unified storage")

asyncio.run(setup())
```

### 2. Get Real-Time Data (from cache)

```python
async def get_realtime_data():
    # Get latest real-time data (timestamp=None)
    inference_data = await data_provider.get_inference_data_unified('ETH/USDT')

    print(f"Symbol: {inference_data.symbol}")
    print(f"Timestamp: {inference_data.timestamp}")
    print(f"Latest price: {inference_data.get_latest_price()}")
    print(f"Data source: {inference_data.data_source}")        # 'cache'
    print(f"Query latency: {inference_data.query_latency_ms}ms")  # <10ms

    # Check data completeness
    if inference_data.has_complete_data():
        print("✓ All required data present")

    # Get data summary
    summary = inference_data.get_data_summary()
    print(f"OHLCV 1m rows: {summary['ohlcv_1m_rows']}")
    print(f"Has orderbook: {summary['has_orderbook']}")
    print(f"Imbalances rows: {summary['imbalances_rows']}")

asyncio.run(get_realtime_data())
```

### 3. Get Historical Data (from database)

```python
from datetime import datetime, timedelta

async def get_historical_data():
    # Get historical data at a specific timestamp
    target_time = datetime.now() - timedelta(hours=1)

    inference_data = await data_provider.get_inference_data_unified(
        symbol='ETH/USDT',
        timestamp=target_time,
        context_window_minutes=5  # ±5 minutes of context
    )

    print(f"Data source: {inference_data.data_source}")        # 'database'
    print(f"Query latency: {inference_data.query_latency_ms}ms")  # <100ms

    # Access multi-timeframe data
    print(f"1s candles: {len(inference_data.ohlcv_1s)}")
    print(f"1m candles: {len(inference_data.ohlcv_1m)}")
    print(f"1h candles: {len(inference_data.ohlcv_1h)}")

    # Access technical indicators
    print(f"RSI: {inference_data.indicators.get('rsi_14')}")
    print(f"MACD: {inference_data.indicators.get('macd')}")

    # Access context data
    if inference_data.context_data is not None:
        print(f"Context data: {len(inference_data.context_data)} rows")

asyncio.run(get_historical_data())
```

### 4. Get Multi-Timeframe Data

```python
async def get_multi_timeframe():
    # Get multiple timeframes at once
    multi_tf = await data_provider.get_multi_timeframe_data_unified(
        symbol='ETH/USDT',
        timeframes=['1m', '5m', '1h'],
        limit=100
    )

    for timeframe, df in multi_tf.items():
        print(f"{timeframe}: {len(df)} candles")
        if not df.empty:
            print(f"  Latest close: {df.iloc[-1]['close_price']}")

asyncio.run(get_multi_timeframe())
```

### 5. Get Order Book Data

```python
async def get_orderbook():
    # Get order book with imbalances
    orderbook = await data_provider.get_order_book_data_unified('ETH/USDT')

    print(f"Mid price: {orderbook.mid_price}")
    print(f"Spread: {orderbook.spread}")
    print(f"Spread (bps): {orderbook.get_spread_bps()}")

    # Get best bid/ask
    best_bid = orderbook.get_best_bid()
    best_ask = orderbook.get_best_ask()
    print(f"Best bid: {best_bid}")
    print(f"Best ask: {best_ask}")

    # Get imbalance summary
    imbalances = orderbook.get_imbalance_summary()
    print(f"Imbalances: {imbalances}")

asyncio.run(get_orderbook())
```

### 6. Get Statistics

```python
# Get unified storage statistics
stats = data_provider.get_unified_storage_stats()

print("=== Cache Statistics ===")
print(f"Hit rate: {stats['cache']['hit_rate_percent']}%")
print(f"Total entries: {stats['cache']['total_entries']}")

print("\n=== Database Statistics ===")
print(f"Total queries: {stats['database']['total_queries']}")
print(f"Avg query time: {stats['database']['avg_query_time_ms']}ms")

print("\n=== Ingestion Statistics ===")
print(f"Total ingested: {stats['ingestion']['total_ingested']}")
print(f"Validation failures: {stats['ingestion']['validation_failures']}")
```

## Integration with Existing Code

### Backward Compatibility

All existing DataProvider methods continue to work:

```python
# Existing methods still work
df = data_provider.get_historical_data('ETH/USDT', '1m', limit=100)
price = data_provider.get_current_price('ETH/USDT')
features = data_provider.get_feature_matrix('ETH/USDT')

# New unified methods available alongside
inference_data = await data_provider.get_inference_data_unified('ETH/USDT')
```

### Gradual Migration

You can migrate to unified storage gradually:

```python
# Option 1: Use existing methods (no changes needed)
df = data_provider.get_historical_data('ETH/USDT', '1m')

# Option 2: Use unified storage for new features
inference_data = await data_provider.get_inference_data_unified('ETH/USDT')
```

## Use Cases

### 1. Real-Time Trading

```python
async def realtime_trading_loop():
    while True:
        # Get latest market data (fast!)
        data = await data_provider.get_inference_data_unified('ETH/USDT')

        # Make trading decision
        if data.has_complete_data():
            price = data.get_latest_price()
            rsi = data.indicators.get('rsi_14', 50)

            if rsi < 30:
                print(f"Buy signal at {price}")
            elif rsi > 70:
                print(f"Sell signal at {price}")

        await asyncio.sleep(1)
```

### 2. Backtesting

```python
async def backtest_strategy(start_time, end_time):
    current_time = start_time

    while current_time < end_time:
        # Get historical data at a specific time
        data = await data_provider.get_inference_data_unified(
            'ETH/USDT',
            timestamp=current_time,
            context_window_minutes=60
        )

        # Run strategy
        if data.has_complete_data():
            # Your strategy logic here
            pass

        # Move to next timestamp
        current_time += timedelta(minutes=1)
```

### 3. Data Annotation

```python
async def annotate_data(timestamps):
    annotations = []

    for timestamp in timestamps:
        # Get data at a specific timestamp
        data = await data_provider.get_inference_data_unified(
            'ETH/USDT',
            timestamp=timestamp,
            context_window_minutes=5
        )

        # Display to user for annotation
        # User marks buy/sell signals
        annotation = {
            'timestamp': timestamp,
            'price': data.get_latest_price(),
            'signal': 'buy',  # User input
            'data': data.to_dict()
        }
        annotations.append(annotation)

    return annotations
```

### 4. Model Training

```python
async def prepare_training_data(symbol, start_time, end_time):
    training_samples = []

    current_time = start_time
    while current_time < end_time:
        # Get complete inference data
        data = await data_provider.get_inference_data_unified(
            symbol,
            timestamp=current_time,
            context_window_minutes=10
        )

        if data.has_complete_data():
            # Extract features
            features = {
                'ohlcv_1m': data.ohlcv_1m.to_numpy(),
                'indicators': data.indicators,
                'imbalances': data.imbalances.to_numpy(),
                'orderbook': data.orderbook_snapshot
            }

            training_samples.append(features)

        current_time += timedelta(minutes=1)

    return training_samples
```

## Configuration

### Database Configuration

Update `config.yaml`:

```yaml
database:
  host: localhost
  port: 5432
  name: trading_data
  user: postgres
  password: postgres
  pool_size: 20
```

### Setup Database

```bash
# Run setup script
python scripts/setup_unified_storage.py
```

## Performance Tips

1. **Use the real-time endpoint for latest data**
   ```python
   # Fast (cache)
   data = await data_provider.get_inference_data_unified('ETH/USDT')

   # Slower (database)
   data = await data_provider.get_inference_data_unified('ETH/USDT', datetime.now())
   ```

2. **Batch historical queries**
   ```python
   # Get multiple timeframes at once
   multi_tf = await data_provider.get_multi_timeframe_data_unified(
       'ETH/USDT',
       ['1m', '5m', '1h'],
       limit=100
   )
   ```

3. **Monitor performance**
   ```python
   stats = data_provider.get_unified_storage_stats()
   print(f"Cache hit rate: {stats['cache']['hit_rate_percent']}%")
   print(f"Avg query time: {stats['database']['avg_query_time_ms']}ms")
   ```

## Troubleshooting

### Unified Storage Not Available

```python
if not data_provider.is_unified_storage_enabled():
    success = await data_provider.enable_unified_storage()
    if not success:
        print("Check database connection and configuration")
```

### Slow Queries

```python
# Check query latency
data = await data_provider.get_inference_data_unified('ETH/USDT', timestamp)
if data.query_latency_ms > 100:
    print(f"Slow query: {data.query_latency_ms}ms")
    # Check database stats
    stats = data_provider.get_unified_storage_stats()
    print(stats['database'])
```

### Missing Data

```python
data = await data_provider.get_inference_data_unified('ETH/USDT', timestamp)
if not data.has_complete_data():
    summary = data.get_data_summary()
    print(f"Missing data: {summary}")
```

## API Reference

### Main Methods

- `enable_unified_storage()` - Enable unified storage system
- `disable_unified_storage()` - Disable unified storage system
- `get_inference_data_unified()` - Get complete inference data
- `get_multi_timeframe_data_unified()` - Get multi-timeframe data
- `get_order_book_data_unified()` - Get order book with imbalances
- `get_unified_storage_stats()` - Get statistics
- `is_unified_storage_enabled()` - Check if enabled

### Data Models

- `InferenceDataFrame` - Complete inference data structure
- `OrderBookDataFrame` - Order book with imbalances
- `OHLCVCandle` - Single candlestick
- `TradeEvent` - Individual trade

## Support

For issues or questions:
1. Check database connection: `python scripts/setup_unified_storage.py`
2. Review logs for errors
3. Check statistics: `data_provider.get_unified_storage_stats()`
@@ -1,337 +0,0 @@
# Unified Data Storage Setup Guide

## Overview

The unified data storage system consolidates all market data storage into a single TimescaleDB backend, replacing fragmented Parquet files, pickle files, and in-memory caches.

## Prerequisites

### 1. PostgreSQL with TimescaleDB

You need PostgreSQL 12+ with the TimescaleDB extension installed.

#### Installation Options

**Option A: Docker (Recommended)**
```bash
docker run -d --name timescaledb \
  -p 5432:5432 \
  -e POSTGRES_PASSWORD=postgres \
  -e POSTGRES_DB=trading_data \
  timescale/timescaledb:latest-pg14
```

**Option B: Local Installation**
- Follow the TimescaleDB installation guide: https://docs.timescale.com/install/latest/
- Create the database: `createdb trading_data`

### 2. Python Dependencies

Ensure you have the required Python packages:
```bash
pip install asyncpg
```

## Database Configuration

Update your `config.yaml` with database connection details:

```yaml
database:
  host: localhost
  port: 5432
  name: trading_data
  user: postgres
  password: postgres
  pool_size: 20
```
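A quick connectivity check can be done with `asyncpg` before running the full setup script. This is a minimal sketch that mirrors the configuration values above; adjust the credentials if your `config.yaml` differs.

```python
import asyncio
import asyncpg  # pip install asyncpg

async def check_connection():
    # Values mirror the config.yaml block above
    pool = await asyncpg.create_pool(
        host='localhost', port=5432,
        user='postgres', password='postgres',
        database='trading_data',
        min_size=1, max_size=20,
    )
    try:
        version = await pool.fetchval(
            "SELECT extversion FROM pg_extension WHERE extname = 'timescaledb'"
        )
        print('TimescaleDB extension version:', version or 'NOT INSTALLED')
    finally:
        await pool.close()

asyncio.run(check_connection())
```
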
## Setup Process

### Step 1: Run Setup Script

```bash
python scripts/setup_unified_storage.py
```

This script will:
1. Connect to the database
2. Verify the TimescaleDB extension
3. Create all required tables
4. Convert tables to hypertables
5. Create indexes for performance
6. Set up continuous aggregates
7. Configure compression policies
8. Configure retention policies
9. Verify the setup
10. Run basic operation tests

### Step 2: Verify Setup

The setup script will display schema information:

```
=== Schema Information ===
Migrations applied: 8
Tables created: 5
Hypertables: 5
Continuous aggregates: 5

=== Table Sizes ===
ohlcv_data: 8192 bytes
order_book_snapshots: 8192 bytes
order_book_1s_agg: 8192 bytes
order_book_imbalances: 8192 bytes
trade_events: 8192 bytes

=== Hypertables ===
ohlcv_data: 0 chunks, compression=enabled
order_book_snapshots: 0 chunks, compression=enabled
order_book_1s_agg: 0 chunks, compression=enabled
order_book_imbalances: 0 chunks, compression=enabled
trade_events: 0 chunks, compression=enabled

=== Continuous Aggregates ===
ohlcv_1m_continuous: 8192 bytes
ohlcv_5m_continuous: 8192 bytes
ohlcv_15m_continuous: 8192 bytes
ohlcv_1h_continuous: 8192 bytes
ohlcv_1d_continuous: 8192 bytes
```

## Database Schema

### Tables

#### 1. ohlcv_data
Stores candlestick data for all timeframes with pre-calculated technical indicators.

**Columns:**
- `timestamp` (TIMESTAMPTZ): Candle timestamp
- `symbol` (VARCHAR): Trading pair (e.g., 'ETH/USDT')
- `timeframe` (VARCHAR): Timeframe (1s, 1m, 5m, 15m, 1h, 1d)
- `open_price`, `high_price`, `low_price`, `close_price` (DECIMAL): OHLC prices
- `volume` (DECIMAL): Trading volume
- `trade_count` (INTEGER): Number of trades
- Technical indicators: `rsi_14`, `macd`, `macd_signal`, `bb_upper`, `bb_middle`, `bb_lower`, etc.

**Primary Key:** `(timestamp, symbol, timeframe)`

#### 2. order_book_snapshots
Stores raw order book snapshots.

**Columns:**
- `timestamp` (TIMESTAMPTZ): Snapshot timestamp
- `symbol` (VARCHAR): Trading pair
- `exchange` (VARCHAR): Exchange name
- `bids` (JSONB): Bid levels (top 50)
- `asks` (JSONB): Ask levels (top 50)
- `mid_price`, `spread`, `bid_volume`, `ask_volume` (DECIMAL): Calculated metrics

**Primary Key:** `(timestamp, symbol, exchange)`

#### 3. order_book_1s_agg
Stores 1-second aggregated order book data with $1 price buckets.

**Columns:**
- `timestamp` (TIMESTAMPTZ): Aggregation timestamp
- `symbol` (VARCHAR): Trading pair
- `price_bucket` (DECIMAL): Price bucket ($1 increments)
- `bid_volume`, `ask_volume` (DECIMAL): Aggregated volumes
- `bid_count`, `ask_count` (INTEGER): Number of orders
- `imbalance` (DECIMAL): Order book imbalance

**Primary Key:** `(timestamp, symbol, price_bucket)`

#### 4. order_book_imbalances
Stores multi-timeframe order book imbalance metrics.

**Columns:**
- `timestamp` (TIMESTAMPTZ): Calculation timestamp
- `symbol` (VARCHAR): Trading pair
- `imbalance_1s`, `imbalance_5s`, `imbalance_15s`, `imbalance_60s` (DECIMAL): Imbalances
- `volume_imbalance_1s`, `volume_imbalance_5s`, etc. (DECIMAL): Volume-weighted imbalances
- `price_range` (DECIMAL): Price range used for calculation

**Primary Key:** `(timestamp, symbol)`

#### 5. trade_events
Stores individual trade events.

**Columns:**
- `timestamp` (TIMESTAMPTZ): Trade timestamp
- `symbol` (VARCHAR): Trading pair
- `exchange` (VARCHAR): Exchange name
- `price` (DECIMAL): Trade price
- `size` (DECIMAL): Trade size
- `side` (VARCHAR): Trade side ('buy' or 'sell')
- `trade_id` (VARCHAR): Unique trade identifier

**Primary Key:** `(timestamp, symbol, exchange, trade_id)`

### Continuous Aggregates

Continuous aggregates automatically generate higher timeframe data from lower timeframes:

1. **ohlcv_1m_continuous**: 1-minute candles from 1-second data
2. **ohlcv_5m_continuous**: 5-minute candles from 1-minute data
3. **ohlcv_15m_continuous**: 15-minute candles from 5-minute data
4. **ohlcv_1h_continuous**: 1-hour candles from 15-minute data
5. **ohlcv_1d_continuous**: 1-day candles from 1-hour data

### Compression Policies

Data is automatically compressed to save storage:

- **ohlcv_data**: Compress after 7 days
- **order_book_snapshots**: Compress after 1 day
- **order_book_1s_agg**: Compress after 2 days
- **order_book_imbalances**: Compress after 2 days
- **trade_events**: Compress after 7 days

Expected compression ratio: **>80%**

### Retention Policies

Old data is automatically deleted:

- **ohlcv_data**: Retain for 2 years
- **order_book_snapshots**: Retain for 30 days
- **order_book_1s_agg**: Retain for 60 days
- **order_book_imbalances**: Retain for 60 days
- **trade_events**: Retain for 90 days

## Performance Optimization

### Indexes

All tables have optimized indexes for common query patterns:

- Symbol + timestamp queries
- Timeframe-specific queries
- Exchange-specific queries
- Multi-column composite indexes

### Query Performance Targets

- **Cache reads**: <10ms
- **Single timestamp queries**: <100ms
- **Time range queries (1 hour)**: <500ms
- **Ingestion throughput**: >1000 ops/sec

### Best Practices

1. **Use time_bucket for aggregations**:
   ```sql
   SELECT time_bucket('1 minute', timestamp) AS bucket,
          symbol,
          avg(close_price) AS avg_price
   FROM ohlcv_data
   WHERE symbol = 'ETH/USDT'
     AND timestamp >= NOW() - INTERVAL '1 hour'
   GROUP BY bucket, symbol;
   ```

2. **Query specific timeframes**:
   ```sql
   SELECT * FROM ohlcv_data
   WHERE symbol = 'ETH/USDT'
     AND timeframe = '1m'
     AND timestamp >= NOW() - INTERVAL '1 day'
   ORDER BY timestamp DESC;
   ```

3. **Use continuous aggregates for historical data**:
   ```sql
   SELECT * FROM ohlcv_1h_continuous
   WHERE symbol = 'ETH/USDT'
     AND timestamp >= NOW() - INTERVAL '7 days'
   ORDER BY timestamp DESC;
   ```

## Monitoring

### Check Database Size

```sql
SELECT
    hypertable_name,
    pg_size_pretty(total_bytes) AS total_size,
    pg_size_pretty(compressed_total_bytes) AS compressed_size,
    ROUND((1 - compressed_total_bytes::numeric / total_bytes::numeric) * 100, 2) AS compression_ratio
FROM timescaledb_information.hypertables
WHERE hypertable_schema = 'public';
```

### Check Chunk Information

```sql
SELECT
    hypertable_name,
    num_chunks,
    num_compressed_chunks,
    compression_enabled
FROM timescaledb_information.hypertables
WHERE hypertable_schema = 'public';
```

### Check Continuous Aggregate Status

```sql
SELECT
    view_name,
    materialization_hypertable_name,
    pg_size_pretty(total_bytes) AS size
FROM timescaledb_information.continuous_aggregates
WHERE view_schema = 'public';
```

## Troubleshooting

### TimescaleDB Extension Not Found

If you see "TimescaleDB extension not found":

1. Ensure TimescaleDB is installed
2. Connect to the database and run: `CREATE EXTENSION timescaledb;`
3. Restart the setup script

### Connection Refused

If you see "connection refused":

1. Check PostgreSQL is running: `pg_isready`
2. Verify connection details in `config.yaml`
3. Check firewall settings

### Permission Denied

If you see "permission denied":

1. Ensure the database user has CREATE privileges
2. Grant privileges: `GRANT ALL PRIVILEGES ON DATABASE trading_data TO postgres;`

### Slow Queries

If queries are slow:

1. Check if indexes exist: `\di` in psql
2. Analyze the query plan: `EXPLAIN ANALYZE <your query>`
3. Ensure compression is enabled
4. Consider adding more specific indexes

## Next Steps

After setup is complete:

1. **Implement data models** (Task 2)
2. **Implement cache layer** (Task 3)
3. **Implement database connection layer** (Task 4)
4. **Start data migration** from Parquet files (Task 7)

## Support

For issues or questions:
- Check the TimescaleDB docs: https://docs.timescale.com/
- Review PostgreSQL logs: `tail -f /var/log/postgresql/postgresql-*.log`
- Enable debug logging in the setup script
docs/main/MODEL_INPUTS_OUTPUTS_REFERENCE.md (new file, 308 lines)
@@ -0,0 +1,308 @@
|
|||||||
|
# Model Inputs & Outputs Reference
|
||||||
|
|
||||||
|
Quick reference for all trading models in the system.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Transformer (AdvancedTradingTransformer)

**Type**: Sequence-to-sequence transformer for multi-timeframe analysis
**Size**: 46M parameters
**Architecture**: 12 layers, 16 attention heads, 1024 model dimension

### Inputs
```python
price_data:   [batch, 150, 5]    # OHLCV sequences (150 candles)
cob_data:     [batch, 150, 100]  # Change of Bid features
tech_data:    [batch, 40]        # Technical indicators (SMA, returns, volatility)
market_data:  [batch, 30]        # Market context (volume, pivots, support/resistance)
```
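
To smoke-test shapes, the documented inputs can be stubbed with zero tensors. A minimal sketch; the keyword names mirror this reference but are not confirmed against the model's actual `forward()` signature, so the call itself is left commented out.

```python
import torch

# Zero-filled tensors with the documented shapes (batch size 1)
dummy_batch = {
    "price_data":  torch.zeros(1, 150, 5),    # OHLCV sequence
    "cob_data":    torch.zeros(1, 150, 100),  # COB features
    "tech_data":   torch.zeros(1, 40),        # technical indicators
    "market_data": torch.zeros(1, 30),        # market context
}

# outputs = transformer_model(**dummy_batch)  # assumes kwargs match the model's forward()
# print(outputs["action_probs"].shape)        # expected: [1, 3]
```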

### Outputs
```python
action_logits:        [batch, 3]   # Raw logits for BUY(1), SELL(2), HOLD(0)
action_probs:         [batch, 3]   # Softmax probabilities
confidence:           [batch, 1]   # Trade confidence (0-1)
price_prediction:     [batch, 1]   # Future price target
volatility_prediction:[batch, 1]   # Expected volatility
trend_strength:       [batch, 1]   # Trend strength (-1 to 1)

# Next candle predictions for each timeframe
next_candles: {
    '1s': [batch, 5],   # [open, high, low, close, volume]
    '1m': [batch, 5],
    '1h': [batch, 5],
    '1d': [batch, 5]
}

# Pivot point predictions (L1-L5)
next_pivots: {
    'L1': {
        'price': [batch, 1],
        'type_prob_high': [batch, 1],  # Probability of high pivot
        'type_prob_low': [batch, 1],   # Probability of low pivot
        'confidence': [batch, 1]
    },
    # ... L2, L3, L4, L5 (same structure)
}

# Trend vector analysis
trend_analysis: {
    'angle_radians': [batch, 1],  # Trend angle
    'steepness': [batch, 1],      # Trend steepness
    'direction': [batch, 1]       # Direction (-1 to 1)
}
```

### Training Targets
```python
actions:        [batch]     # Action labels (0=HOLD, 1=BUY, 2=SELL)
future_prices:  [batch]     # Price targets
trade_success:  [batch, 1]  # Success labels (0.0 or 1.0)
```

---

## 2. CNN (StandardizedCNN / EnhancedCNN)

**Type**: Convolutional neural network for pattern recognition
**Size**: ~5-10M parameters
**Architecture**: Multi-scale convolutions with attention

### Inputs
```python
# Via BaseDataInput.get_feature_vector()
feature_vector: [batch, 7834]  # Flattened features containing:
    - OHLCV ETH: 300 frames × 4 timeframes × 5 = 6000
    - OHLCV BTC: 300 frames × 5 = 1500
    - COB features: 184 (±20 buckets + MA imbalance)
    - Technical indicators: 100 (padded)
    - Last predictions: 50 (padded)
```
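
A quick sanity check of the documented layout, using only the sizes listed above:

```python
# Sanity check of the documented feature layout (sizes taken from the reference above)
OHLCV_ETH = 300 * 4 * 5   # 300 frames x 4 timeframes x OHLCV = 6000
OHLCV_BTC = 300 * 5       # 1500
COB       = 184
TECH      = 100
PREV_PRED = 50

assert OHLCV_ETH + OHLCV_BTC + COB + TECH + PREV_PRED == 7834
```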

### Outputs
```python
action_logits:     [batch, 3]     # BUY, SELL, HOLD logits
action_probs:      [batch, 3]     # Softmax probabilities
confidence:        [batch, 1]     # Prediction confidence
hidden_states:     [batch, 1024]  # Feature embeddings (for cross-model feeding)
predicted_returns: [batch, 4]     # [return_1s, return_1m, return_1h, return_1d]
```

### Training Targets
```python
actions: [batch]     # Action labels (0=HOLD, 1=BUY, 2=SELL)
returns: [batch, 4]  # Actual returns per timeframe
```

---

## 3. DQN (Deep Q-Network Agent)

**Type**: Reinforcement learning agent for sequential decision making
**Size**: ~15M parameters
**Architecture**: Deep Q-Network with dueling architecture

### Inputs
```python
# Via BaseDataInput.get_feature_vector()
state: [batch, 7850]  # Full feature vector including:
    - Multi-timeframe OHLCV data
    - COB features
    - Technical indicators
    - Market regime indicators
    - Previous predictions
```

### Outputs
```python
q_values:       [batch, 3]  # Q-values for BUY, SELL, HOLD
action:         int         # Selected action (0, 1, 2)
confidence:     float       # Action confidence (0-1)

# Auxiliary outputs
regime_probs:   [batch, 4]  # [trending, ranging, volatile, mixed]
price_direction:[batch, 3]  # [down, neutral, up]
volatility:     [batch, 1]  # Predicted volatility
value:          [batch, 1]  # State value (V)
advantage:      [batch, 3]  # Action advantages (A)
```

### Training Targets
```python
# RL uses experience replay
experience: {
    'state': [7850],
    'action': int,
    'reward': float,
    'next_state': [7850],
    'done': bool
}
```
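
For illustration, a minimal buffer that stores and samples experiences of this shape. This is a generic sketch, not the project's actual replay implementation; the capacity and batch size are arbitrary.

```python
import random
from collections import deque

class ReplayBuffer:
    """Generic experience replay buffer (illustrative sketch)."""

    def __init__(self, capacity: int = 10_000):
        self.buffer = deque(maxlen=capacity)  # oldest experiences are dropped first

    def push(self, state, action, reward, next_state, done):
        self.buffer.append({
            'state': state, 'action': action, 'reward': reward,
            'next_state': next_state, 'done': done,
        })

    def sample(self, batch_size: int = 32):
        # Uniform random sampling without replacement
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))
```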

---

## 4. COB RL Model (MassiveRLNetwork)

**Type**: Specialized RL for Change of Bid (COB) data
**Size**: ~3M parameters
**Architecture**: Deep network focused on order book dynamics

### Inputs
```python
cob_features: [batch, input_size]  # COB-specific features:
    - Bid/ask imbalance
    - Order book depth
    - Price level changes
    - Volume at price levels
    - Moving averages of imbalance
```

### Outputs
```python
price_logits:        [batch, 3]  # Direction logits [DOWN, SIDEWAYS, UP]
price_probs:         [batch, 3]  # Direction probabilities
confidence:          [batch, 1]  # Prediction confidence
value:               [batch, 1]  # State value estimate
predicted_direction: int         # 0=DOWN, 1=SIDEWAYS, 2=UP
```

### Training Targets
```python
targets: {
    'direction': [batch],   # Direction labels (0, 1, 2)
    'value': [batch],       # Value targets
    'confidence': [batch]   # Confidence targets
}
```

---

## 5. Extrema Trainer

**Type**: Pivot point detection and prediction
**Size**: ~1M parameters (lightweight)
**Architecture**: Statistical + ML hybrid

### Inputs
```python
# Context data (200 candles)
context: {
    'symbol': str,
    'candles': deque[200],   # Recent OHLCV candles
    'features': array,       # Extracted features
    'last_update': datetime
}

# For prediction
current_price: float
now: datetime
```

### Outputs
```python
# Detected extrema
extrema: {
    'type': str,            # 'high' or 'low'
    'price': float,
    'timestamp': datetime,
    'confidence': float,    # 0-1
    'window_size': int
}

# Predicted pivot
predicted_pivot: {
    'type': str,             # 'high' or 'low'
    'price': float,          # Predicted price level
    'timestamp': datetime,   # Predicted time
    'confidence': float,     # 0-1
    'horizon_seconds': int   # Time until pivot (30-300s)
}
```

### Training Data
```python
# Historical extrema for validation
historical_extrema: List[{
    'price': float,
    'timestamp': datetime,
    'type': str,
    'detected': bool
}]
```
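
Such historical extrema can be labelled offline with a symmetric rolling-window scan over closing prices. The sketch below is a generic detector, not the Extrema Trainer's actual logic, and the `window` size is an assumption.

```python
import numpy as np

def detect_local_extrema(close: np.ndarray, window: int = 5):
    """Mark bars that are the max/min of their +/- `window` neighborhood."""
    highs, lows = [], []
    for i in range(window, len(close) - window):
        neighborhood = close[i - window:i + window + 1]
        if close[i] >= neighborhood.max():
            highs.append(i)   # candidate 'high' pivot
        elif close[i] <= neighborhood.min():
            lows.append(i)    # candidate 'low' pivot
    return highs, lows
```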

---

## Common Patterns

### Action Encoding (All Models)
```python
0 = HOLD   # No action / maintain position
1 = BUY    # Enter long / close short
2 = SELL   # Enter short / close long
```

### Confidence Scores
- Range: `0.0` to `1.0`
- Typical threshold: `0.6` (60%), applied in the sketch below
- High confidence: `> 0.8`
- Low confidence: `< 0.4`
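
In code, the typical threshold is a simple gate on the model's confidence output. A minimal sketch; the helper name is hypothetical.

```python
CONFIDENCE_THRESHOLD = 0.6  # typical threshold from the reference above

def should_act(action: int, confidence: float, threshold: float = CONFIDENCE_THRESHOLD) -> bool:
    """Only act on BUY(1)/SELL(2) signals whose confidence clears the threshold."""
    return action != 0 and confidence >= threshold
```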

### Batch Sizes
- **Training**: Usually `1` (annotation-based) or `32-128` (batch training)
- **Inference**: Usually `1` (real-time prediction)

### Device Management
All models support:
- CPU: `torch.device('cpu')`
- CUDA: `torch.device('cuda')`
- Automatic device selection based on availability (see the sketch below)
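
The automatic selection mentioned above usually follows the standard PyTorch pattern; a minimal sketch:

```python
import torch

# Prefer CUDA when available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Move a model and its inputs to the selected device (names are placeholders):
# model = model.to(device)
# features = features.to(device)
```
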
---
## Model Selection Guide

| Use Case | Recommended Model | Why |
|----------|------------------|-----|
| Multi-timeframe analysis | **Transformer** | Handles 150-candle sequences across timeframes |
| Pattern recognition | **CNN** | Excellent at visual pattern detection |
| Sequential decisions | **DQN** | Learns optimal action sequences via RL |
| Order book dynamics | **COB RL** | Specialized for bid/ask imbalance |
| Pivot detection | **Extrema** | Lightweight, fast pivot predictions |

---

## Integration Example

```python
# Get base data input
base_input = data_provider.get_base_data_input(symbol, timestamp)

# CNN prediction
cnn_features = base_input.get_feature_vector()
cnn_output = cnn_model(cnn_features)
cnn_action = torch.argmax(cnn_output['action_probs'])

# Transformer prediction
transformer_batch = prepare_transformer_batch(base_input)
transformer_output = transformer_model(**transformer_batch)
transformer_action = torch.argmax(transformer_output['action_probs'])

# DQN prediction
dqn_state = base_input.get_feature_vector()
dqn_output = dqn_agent.select_action(dqn_state)
dqn_action = dqn_output['action']

# Ensemble decision
final_action = majority_vote([cnn_action, transformer_action, dqn_action])
```
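
`majority_vote` is referenced above but not defined in this document. One possible implementation, sketched here with a HOLD fallback when there is no clear majority:

```python
from collections import Counter

def majority_vote(actions) -> int:
    """Return the most common action; fall back to HOLD (0) when no action wins a majority."""
    counts = Counter(int(a) for a in actions)          # int() also handles 0-dim tensors
    action, votes = counts.most_common(1)[0]
    return action if votes > len(actions) // 2 else 0  # e.g. 2 of 3 votes required
```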

---

## Notes

1. **Shape Conventions**: `[batch, ...]` indicates batch dimension first
2. **Dtype**: All tensors use `torch.float32` unless specified
3. **Gradients**: Gradient tracking is only needed during training; inference does not require it
4. **Normalization**: Features are typically normalized to `[-1, 1]` or `[0, 1]` (see the sketch below)
5. **Missing Data**: Padded with zeros or last known values
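
For note 4, a minimal min-max scaling sketch; the target range is a parameter, and the actual pipeline's normalization may differ.

```python
import numpy as np

def min_max_scale(x: np.ndarray, low: float = -1.0, high: float = 1.0) -> np.ndarray:
    """Scale features to [low, high]; constant inputs map to the middle of the range."""
    x = x.astype(float)
    x_min, x_max = x.min(), x.max()
    if x_max == x_min:
        return np.full_like(x, (low + high) / 2.0)
    return low + (x - x_min) * (high - low) / (x_max - x_min)
```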