uni data storage

This commit is contained in:
Dobromir Popov
2025-10-20 09:48:59 +03:00
parent 002d0f7858
commit f464a412dc
12 changed files with 2905 additions and 181 deletions


@@ -0,0 +1,860 @@
# Design Document: Unified Data Storage System
## Overview
This design document outlines the architecture for unifying all data storage and retrieval methods in the trading system. The current system uses multiple fragmented approaches (Parquet files, pickle files, in-memory caches, and TimescaleDB) which creates complexity and inconsistency. The unified system will consolidate these into a single, efficient TimescaleDB-based storage backend with a clean, unified API.
### Key Design Principles
1. **Single Source of Truth**: TimescaleDB as the primary storage backend for all time-series data
2. **Unified Interface**: One method (`get_inference_data()`) for all data retrieval needs
3. **Performance First**: In-memory caching for real-time data, optimized queries for historical data
4. **Backward Compatibility**: Seamless migration from existing storage formats
5. **Separation of Concerns**: Clear boundaries between storage, caching, and business logic
## Architecture
### High-Level Architecture
```
┌─────────────────────────────────────────────────────────────┐
│                      Application Layer                      │
│        (Models, Backtesting, Annotation, Dashboard)         │
└────────────────────┬────────────────────────────────────────┘
                     │
┌────────────────────┴────────────────────────────────────────┐
│                  Unified Data Provider API                  │
│                                                              │
│  get_inference_data(symbol, timestamp=None, context_window) │
│  get_multi_timeframe_data(symbol, timeframes, timestamp)    │
│  get_order_book_data(symbol, timestamp, aggregation)        │
└────────────────────┬────────────────────────────────────────┘
                     │
         ┌───────────┴─────────────┐
         ▼                         ▼
┌──────────────────┐      ┌──────────────────┐
│   Cache Layer    │      │  Storage Layer   │
│   (In-Memory)    │      │  (TimescaleDB)   │
│                  │      │                  │
│ - Last 5 min     │      │ - OHLCV Data     │
│ - Real-time      │      │ - Order Book     │
│ - Low latency    │      │ - Trade Data     │
└──────────────────┘      │ - Aggregations   │
                          └──────────────────┘
```
### Data Flow
```
Real-Time Data Flow:
WebSocket → Tick Aggregator → Cache Layer → TimescaleDB (async)
                                   │
                                   ▼
                          Application (fast read)

Historical Data Flow:
Application → Unified API → TimescaleDB → Cache (optional) → Application
```
## Components and Interfaces
### 1. Unified Data Provider
The central component that provides a single interface for all data access.
```python
class UnifiedDataProvider:
    """
    Unified interface for all market data access.
    Handles both real-time and historical data retrieval.
    """

    def __init__(self, db_connection_pool, cache_manager):
        self.db = db_connection_pool
        self.cache = cache_manager
        self.symbols = ['ETH/USDT', 'BTC/USDT']
        self.timeframes = ['1s', '1m', '5m', '15m', '1h', '1d']

    async def get_inference_data(
        self,
        symbol: str,
        timestamp: Optional[datetime] = None,
        context_window_minutes: int = 5
    ) -> InferenceDataFrame:
        """
        Get complete inference data for a symbol at a specific time.

        Args:
            symbol: Trading symbol (e.g., 'ETH/USDT')
            timestamp: Target timestamp (None = latest real-time data)
            context_window_minutes: Minutes of context data before/after timestamp

        Returns:
            InferenceDataFrame with OHLCV, indicators, COB data, imbalances
        """

    async def get_multi_timeframe_data(
        self,
        symbol: str,
        timeframes: List[str],
        timestamp: Optional[datetime] = None,
        limit: int = 100
    ) -> Dict[str, pd.DataFrame]:
        """
        Get aligned multi-timeframe candlestick data.

        Args:
            symbol: Trading symbol
            timeframes: List of timeframes to retrieve
            timestamp: Target timestamp (None = latest)
            limit: Number of candles per timeframe

        Returns:
            Dictionary mapping timeframe to DataFrame
        """

    async def get_order_book_data(
        self,
        symbol: str,
        timestamp: Optional[datetime] = None,
        aggregation: str = '1s',
        limit: int = 300
    ) -> OrderBookDataFrame:
        """
        Get order book data with imbalance metrics.

        Args:
            symbol: Trading symbol
            timestamp: Target timestamp (None = latest)
            aggregation: Aggregation level ('raw', '1s', '1m')
            limit: Number of data points

        Returns:
            OrderBookDataFrame with bids, asks, imbalances
        """
```
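For illustration, a caller might use this API as sketched below. This is a usage sketch, not part of the design itself: it assumes the methods above are implemented as described and that an asyncpg pool and a `DataCacheManager` already exist.

```python
import asyncio
from datetime import datetime, timedelta, timezone

async def run_example(pool, cache):
    provider = UnifiedDataProvider(pool, cache)

    # Real-time path: timestamp=None serves the latest data from the cache layer.
    live = await provider.get_inference_data('ETH/USDT')
    print(live.data_source, live.query_latency_ms)

    # Historical path: a concrete timestamp routes the query to TimescaleDB,
    # with ±5 minutes of context data around the target time.
    target = datetime.now(timezone.utc) - timedelta(days=1)
    historical = await provider.get_inference_data(
        'ETH/USDT', timestamp=target, context_window_minutes=5
    )
    print(historical.ohlcv_1m.tail())
```

The same two call shapes cover backtesting, annotation, and live inference, which is the point of the single entry point.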
### 2. Storage Layer (TimescaleDB)
TimescaleDB schema and access patterns.
#### Database Schema
```sql
-- OHLCV Data (Hypertable)
CREATE TABLE ohlcv_data (
timestamp TIMESTAMPTZ NOT NULL,
symbol VARCHAR(20) NOT NULL,
timeframe VARCHAR(10) NOT NULL,
open_price DECIMAL(20,8) NOT NULL,
high_price DECIMAL(20,8) NOT NULL,
low_price DECIMAL(20,8) NOT NULL,
close_price DECIMAL(20,8) NOT NULL,
volume DECIMAL(30,8) NOT NULL,
trade_count INTEGER,
-- Technical Indicators (pre-calculated)
rsi_14 DECIMAL(10,4),
macd DECIMAL(20,8),
macd_signal DECIMAL(20,8),
bb_upper DECIMAL(20,8),
bb_middle DECIMAL(20,8),
bb_lower DECIMAL(20,8),
PRIMARY KEY (timestamp, symbol, timeframe)
);
SELECT create_hypertable('ohlcv_data', 'timestamp');
CREATE INDEX idx_ohlcv_symbol_tf ON ohlcv_data (symbol, timeframe, timestamp DESC);
-- Order Book Snapshots (Hypertable)
CREATE TABLE order_book_snapshots (
timestamp TIMESTAMPTZ NOT NULL,
symbol VARCHAR(20) NOT NULL,
exchange VARCHAR(20) NOT NULL,
bids JSONB NOT NULL, -- Top 50 levels
asks JSONB NOT NULL, -- Top 50 levels
mid_price DECIMAL(20,8),
spread DECIMAL(20,8),
bid_volume DECIMAL(30,8),
ask_volume DECIMAL(30,8),
PRIMARY KEY (timestamp, symbol, exchange)
);
SELECT create_hypertable('order_book_snapshots', 'timestamp');
CREATE INDEX idx_obs_symbol ON order_book_snapshots (symbol, timestamp DESC);
-- Order Book Aggregated 1s (Hypertable)
CREATE TABLE order_book_1s_agg (
timestamp TIMESTAMPTZ NOT NULL,
symbol VARCHAR(20) NOT NULL,
price_bucket DECIMAL(20,2) NOT NULL, -- $1 buckets
bid_volume DECIMAL(30,8),
ask_volume DECIMAL(30,8),
bid_count INTEGER,
ask_count INTEGER,
imbalance DECIMAL(10,6),
PRIMARY KEY (timestamp, symbol, price_bucket)
);
SELECT create_hypertable('order_book_1s_agg', 'timestamp');
CREATE INDEX idx_ob1s_symbol ON order_book_1s_agg (symbol, timestamp DESC);
-- Order Book Imbalances (Hypertable)
CREATE TABLE order_book_imbalances (
timestamp TIMESTAMPTZ NOT NULL,
symbol VARCHAR(20) NOT NULL,
imbalance_1s DECIMAL(10,6),
imbalance_5s DECIMAL(10,6),
imbalance_15s DECIMAL(10,6),
imbalance_60s DECIMAL(10,6),
volume_imbalance_1s DECIMAL(10,6),
volume_imbalance_5s DECIMAL(10,6),
volume_imbalance_15s DECIMAL(10,6),
volume_imbalance_60s DECIMAL(10,6),
price_range DECIMAL(10,2),
PRIMARY KEY (timestamp, symbol)
);
SELECT create_hypertable('order_book_imbalances', 'timestamp');
CREATE INDEX idx_obi_symbol ON order_book_imbalances (symbol, timestamp DESC);
-- Trade Events (Hypertable)
CREATE TABLE trade_events (
timestamp TIMESTAMPTZ NOT NULL,
symbol VARCHAR(20) NOT NULL,
exchange VARCHAR(20) NOT NULL,
price DECIMAL(20,8) NOT NULL,
size DECIMAL(30,8) NOT NULL,
side VARCHAR(4) NOT NULL,
trade_id VARCHAR(100) NOT NULL,
PRIMARY KEY (timestamp, symbol, exchange, trade_id)
);
SELECT create_hypertable('trade_events', 'timestamp');
CREATE INDEX idx_trades_symbol ON trade_events (symbol, timestamp DESC);
```
#### Continuous Aggregates
```sql
-- 1m OHLCV from 1s data
CREATE MATERIALIZED VIEW ohlcv_1m_continuous
WITH (timescaledb.continuous) AS
SELECT
time_bucket('1 minute', timestamp) AS timestamp,
symbol,
'1m' AS timeframe,
first(open_price, timestamp) AS open_price,
max(high_price) AS high_price,
min(low_price) AS low_price,
last(close_price, timestamp) AS close_price,
sum(volume) AS volume,
sum(trade_count) AS trade_count
FROM ohlcv_data
WHERE timeframe = '1s'
GROUP BY time_bucket('1 minute', timestamp), symbol;
-- 5m OHLCV from 1m data
CREATE MATERIALIZED VIEW ohlcv_5m_continuous
WITH (timescaledb.continuous) AS
SELECT
time_bucket('5 minutes', timestamp) AS timestamp,
symbol,
'5m' AS timeframe,
first(open_price, timestamp) AS open_price,
max(high_price) AS high_price,
min(low_price) AS low_price,
last(close_price, timestamp) AS close_price,
sum(volume) AS volume,
sum(trade_count) AS trade_count
FROM ohlcv_data
WHERE timeframe = '1m'
GROUP BY time_bucket('5 minutes', timestamp), symbol;
-- Similar for 15m, 1h, 1d
```
#### Compression Policies
```sql
-- Compress data older than 7 days
SELECT add_compression_policy('ohlcv_data', INTERVAL '7 days');
SELECT add_compression_policy('order_book_snapshots', INTERVAL '1 day');
SELECT add_compression_policy('order_book_1s_agg', INTERVAL '2 days');
SELECT add_compression_policy('order_book_imbalances', INTERVAL '2 days');
SELECT add_compression_policy('trade_events', INTERVAL '7 days');
```
#### Retention Policies
```sql
-- Retain data for specified periods
SELECT add_retention_policy('order_book_snapshots', INTERVAL '30 days');
SELECT add_retention_policy('order_book_1s_agg', INTERVAL '60 days');
SELECT add_retention_policy('order_book_imbalances', INTERVAL '60 days');
SELECT add_retention_policy('trade_events', INTERVAL '90 days');
SELECT add_retention_policy('ohlcv_data', INTERVAL '2 years');
```
### 3. Cache Layer
In-memory caching for low-latency real-time data access.
```python
class DataCacheManager:
    """
    Manages in-memory cache for real-time data.
    Provides <10ms latency for latest data access.
    """

    def __init__(self, cache_duration_seconds: int = 300):
        # Cache last 5 minutes of data
        self.cache_duration = cache_duration_seconds

        # In-memory storage
        self.ohlcv_cache: Dict[str, Dict[str, deque]] = {}
        self.orderbook_cache: Dict[str, deque] = {}
        self.imbalance_cache: Dict[str, deque] = {}
        self.trade_cache: Dict[str, deque] = {}

        # Cache statistics
        self.cache_hits = 0
        self.cache_misses = 0

    def add_ohlcv_candle(self, symbol: str, timeframe: str, candle: Dict):
        """Add OHLCV candle to cache"""

    def add_orderbook_snapshot(self, symbol: str, snapshot: Dict):
        """Add order book snapshot to cache"""

    def add_imbalance_data(self, symbol: str, imbalance: Dict):
        """Add imbalance metrics to cache"""

    def get_latest_ohlcv(self, symbol: str, timeframe: str, limit: int = 100) -> List[Dict]:
        """Get latest OHLCV candles from cache"""

    def get_latest_orderbook(self, symbol: str) -> Optional[Dict]:
        """Get latest order book snapshot from cache"""

    def get_latest_imbalances(self, symbol: str, limit: int = 60) -> List[Dict]:
        """Get latest imbalance metrics from cache"""

    def evict_old_data(self):
        """Remove data older than cache duration"""
```
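The eviction logic is only named above. One possible sketch, written here as a free function over a `DataCacheManager` and assuming every cached dict carries a timezone-aware `timestamp` key and entries arrive in time order:

```python
from collections import deque
from datetime import datetime, timedelta, timezone

def evict_old_data(cache: "DataCacheManager") -> None:
    """Drop cached entries older than the configured duration (illustrative sketch)."""
    cutoff = datetime.now(timezone.utc) - timedelta(seconds=cache.cache_duration)

    def _evict(dq: deque) -> None:
        # Entries are appended in arrival order, so the oldest sit at the left end.
        while dq and dq[0].get('timestamp') is not None and dq[0]['timestamp'] < cutoff:
            dq.popleft()

    for per_timeframe in cache.ohlcv_cache.values():
        for dq in per_timeframe.values():
            _evict(dq)
    for dq in cache.orderbook_cache.values():
        _evict(dq)
    for dq in cache.imbalance_cache.values():
        _evict(dq)
    for dq in cache.trade_cache.values():
        _evict(dq)
```

Bounded deques (`deque(maxlen=N)`) are an alternative when a fixed item count, rather than a time window, is acceptable.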
### 4. Data Models
Standardized data structures for all components.
```python
@dataclass
class InferenceDataFrame:
    """Complete inference data for a single timestamp"""
    symbol: str
    timestamp: datetime

    # Multi-timeframe OHLCV
    ohlcv_1s: pd.DataFrame
    ohlcv_1m: pd.DataFrame
    ohlcv_5m: pd.DataFrame
    ohlcv_15m: pd.DataFrame
    ohlcv_1h: pd.DataFrame
    ohlcv_1d: pd.DataFrame

    # Order book data
    orderbook_snapshot: Optional[Dict]
    orderbook_1s_agg: pd.DataFrame

    # Imbalance metrics
    imbalances: pd.DataFrame  # Multi-timeframe imbalances

    # Technical indicators (pre-calculated)
    indicators: Dict[str, float]

    # Context window data (±N minutes)
    context_data: Optional[pd.DataFrame]

    # Metadata
    data_source: str  # 'cache' or 'database'
    query_latency_ms: float


@dataclass
class OrderBookDataFrame:
    """Order book data with imbalances"""
    symbol: str
    timestamp: datetime

    # Raw order book
    bids: List[Tuple[float, float]]  # (price, size)
    asks: List[Tuple[float, float]]

    # Aggregated data
    price_buckets: pd.DataFrame  # $1 buckets

    # Imbalance metrics
    imbalance_1s: float
    imbalance_5s: float
    imbalance_15s: float
    imbalance_60s: float

    # Volume-weighted imbalances
    volume_imbalance_1s: float
    volume_imbalance_5s: float
    volume_imbalance_15s: float
    volume_imbalance_60s: float

    # Statistics
    mid_price: float
    spread: float
    bid_volume: float
    ask_volume: float
```
### 5. Data Ingestion Pipeline
Real-time data ingestion with async persistence.
```python
class DataIngestionPipeline:
    """
    Handles real-time data ingestion from WebSocket sources.
    Writes to cache immediately, persists to DB asynchronously.
    """

    def __init__(self, cache_manager, db_connection_pool):
        self.cache = cache_manager
        self.db = db_connection_pool

        # Batch write buffers
        self.ohlcv_buffer: List[Dict] = []
        self.orderbook_buffer: List[Dict] = []
        self.trade_buffer: List[Dict] = []

        # Batch write settings
        self.batch_size = 100
        self.batch_timeout_seconds = 5

    async def ingest_ohlcv_candle(self, symbol: str, timeframe: str, candle: Dict):
        """
        Ingest OHLCV candle.
        1. Add to cache immediately
        2. Buffer for batch write to DB
        """
        # Immediate cache write
        self.cache.add_ohlcv_candle(symbol, timeframe, candle)

        # Buffer for DB write
        self.ohlcv_buffer.append({
            'symbol': symbol,
            'timeframe': timeframe,
            **candle
        })

        # Flush if buffer full
        if len(self.ohlcv_buffer) >= self.batch_size:
            await self._flush_ohlcv_buffer()

    async def ingest_orderbook_snapshot(self, symbol: str, snapshot: Dict):
        """Ingest order book snapshot"""
        # Immediate cache write
        self.cache.add_orderbook_snapshot(symbol, snapshot)

        # Calculate and cache imbalances
        imbalances = self._calculate_imbalances(symbol, snapshot)
        self.cache.add_imbalance_data(symbol, imbalances)

        # Buffer for DB write
        self.orderbook_buffer.append({
            'symbol': symbol,
            **snapshot
        })

        # Flush if buffer full
        if len(self.orderbook_buffer) >= self.batch_size:
            await self._flush_orderbook_buffer()

    async def _flush_ohlcv_buffer(self):
        """Batch write OHLCV data to database"""
        if not self.ohlcv_buffer:
            return

        try:
            # Prepare batch insert
            values = [
                (
                    item['timestamp'],
                    item['symbol'],
                    item['timeframe'],
                    item['open'],
                    item['high'],
                    item['low'],
                    item['close'],
                    item['volume'],
                    item.get('trade_count', 0)
                )
                for item in self.ohlcv_buffer
            ]

            # Batch insert
            await self.db.executemany(
                """
                INSERT INTO ohlcv_data
                (timestamp, symbol, timeframe, open_price, high_price,
                 low_price, close_price, volume, trade_count)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
                ON CONFLICT (timestamp, symbol, timeframe) DO UPDATE
                SET close_price = EXCLUDED.close_price,
                    high_price = GREATEST(ohlcv_data.high_price, EXCLUDED.high_price),
                    low_price = LEAST(ohlcv_data.low_price, EXCLUDED.low_price),
                    volume = ohlcv_data.volume + EXCLUDED.volume,
                    trade_count = ohlcv_data.trade_count + EXCLUDED.trade_count
                """,
                values
            )

            # Clear buffer
            self.ohlcv_buffer.clear()

        except Exception as e:
            logger.error(f"Error flushing OHLCV buffer: {e}")
```
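The class configures `batch_timeout_seconds = 5`, but the code above only flushes when a buffer fills. A background task along these lines would cover the timeout path; it is a sketch and assumes a `_flush_orderbook_buffer()` counterpart exists, as the snapshot ingestion path implies.

```python
import asyncio

async def run_periodic_flush(pipeline: "DataIngestionPipeline") -> None:
    """Flush partially filled buffers on a fixed interval (illustrative sketch)."""
    while True:
        await asyncio.sleep(pipeline.batch_timeout_seconds)
        # Flush whatever has accumulated, even if batch_size was never reached.
        await pipeline._flush_ohlcv_buffer()
        await pipeline._flush_orderbook_buffer()

# Typically started alongside the WebSocket consumers:
# asyncio.create_task(run_periodic_flush(pipeline))
```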
### 6. Migration System
Migrate existing Parquet/pickle data to TimescaleDB.
```python
class DataMigrationManager:
    """
    Migrates existing data from Parquet/pickle files to TimescaleDB.
    Ensures data integrity and provides rollback capability.
    """

    def __init__(self, db_connection_pool, cache_dir: Path):
        self.db = db_connection_pool
        self.cache_dir = cache_dir

    async def migrate_all_data(self):
        """Migrate all existing data to TimescaleDB"""
        logger.info("Starting data migration to TimescaleDB")

        # Migrate OHLCV data from Parquet files
        await self._migrate_ohlcv_data()

        # Migrate order book data if exists
        await self._migrate_orderbook_data()

        # Verify migration
        await self._verify_migration()

        logger.info("Data migration completed successfully")

    async def _migrate_ohlcv_data(self):
        """Migrate OHLCV data from Parquet files"""
        parquet_files = list(self.cache_dir.glob("*.parquet"))

        for parquet_file in parquet_files:
            try:
                # Parse filename: ETHUSDT_1m.parquet
                filename = parquet_file.stem
                parts = filename.split('_')
                if len(parts) != 2:
                    continue

                symbol_raw = parts[0]
                timeframe = parts[1]

                # Convert symbol format
                symbol = self._convert_symbol_format(symbol_raw)

                # Read Parquet file
                df = pd.read_parquet(parquet_file)

                # Migrate data in batches
                await self._migrate_ohlcv_batch(symbol, timeframe, df)

                logger.info(f"Migrated {len(df)} rows from {parquet_file.name}")

            except Exception as e:
                logger.error(f"Error migrating {parquet_file}: {e}")

    async def _migrate_ohlcv_batch(self, symbol: str, timeframe: str, df: pd.DataFrame):
        """Migrate a batch of OHLCV data"""
        # Prepare data for insertion
        values = []
        for idx, row in df.iterrows():
            values.append((
                row['timestamp'],
                symbol,
                timeframe,
                row['open'],
                row['high'],
                row['low'],
                row['close'],
                row['volume'],
                row.get('trade_count', 0)
            ))

        # Batch insert
        await self.db.executemany(
            """
            INSERT INTO ohlcv_data
            (timestamp, symbol, timeframe, open_price, high_price,
             low_price, close_price, volume, trade_count)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
            ON CONFLICT (timestamp, symbol, timeframe) DO NOTHING
            """,
            values
        )
```
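`_convert_symbol_format()` is referenced but not shown. A plausible sketch for filenames like `ETHUSDT_1m.parquet`, assuming the quote currencies listed below cover the cached files, could be:

```python
def _convert_symbol_format(self, symbol_raw: str) -> str:
    """Convert 'ETHUSDT' (filename form) to 'ETH/USDT' (database form) - illustrative sketch."""
    # Assumption: these quote currencies cover the existing Parquet files.
    for quote in ('USDT', 'USDC', 'BTC', 'ETH'):
        if symbol_raw.endswith(quote) and len(symbol_raw) > len(quote):
            return f"{symbol_raw[:-len(quote)]}/{quote}"
    # Fall back to the raw name if no known quote matches.
    return symbol_raw
```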
## Error Handling
### Data Validation
```python
class DataValidator:
    """Validates all incoming data before storage"""

    @staticmethod
    def validate_ohlcv(candle: Dict) -> bool:
        """Validate OHLCV candle data"""
        try:
            # Check required fields
            required = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
            if not all(field in candle for field in required):
                return False

            # Validate OHLC relationships
            if candle['high'] < candle['low']:
                logger.warning("Invalid OHLCV: high < low")
                return False

            if candle['high'] < candle['open'] or candle['high'] < candle['close']:
                logger.warning("Invalid OHLCV: high < open/close")
                return False

            if candle['low'] > candle['open'] or candle['low'] > candle['close']:
                logger.warning("Invalid OHLCV: low > open/close")
                return False

            # Validate positive volume
            if candle['volume'] < 0:
                logger.warning("Invalid OHLCV: negative volume")
                return False

            return True

        except Exception as e:
            logger.error(f"Error validating OHLCV: {e}")
            return False

    @staticmethod
    def validate_orderbook(orderbook: Dict) -> bool:
        """Validate order book data"""
        try:
            # Check required fields
            if 'bids' not in orderbook or 'asks' not in orderbook:
                return False

            # Validate bid/ask relationship
            if orderbook['bids'] and orderbook['asks']:
                best_bid = max(bid[0] for bid in orderbook['bids'])
                best_ask = min(ask[0] for ask in orderbook['asks'])
                if best_bid >= best_ask:
                    logger.warning("Invalid orderbook: bid >= ask")
                    return False

            return True

        except Exception as e:
            logger.error(f"Error validating orderbook: {e}")
            return False
```
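Requirement 10.1 also calls for UTC timestamp validation, which the class above does not yet cover. A sketch of such a check, which could be added to `DataValidator` as another `@staticmethod`, assuming timestamps arrive either as `datetime` objects or as positive epoch values:

```python
import logging
from datetime import datetime, timedelta
from typing import Union

logger = logging.getLogger(__name__)

def validate_timestamp(ts: Union[datetime, int, float]) -> bool:
    """Accept timezone-aware UTC datetimes or positive epoch values (illustrative sketch)."""
    try:
        if isinstance(ts, (int, float)):
            # Epoch values are implicitly UTC; only check plausibility here.
            return ts > 0
        if not isinstance(ts, datetime):
            return False
        if ts.tzinfo is None:
            logger.warning("Invalid timestamp: naive datetime, expected UTC")
            return False
        if ts.utcoffset() != timedelta(0):
            logger.warning("Invalid timestamp: non-UTC timezone offset")
            return False
        return True
    except Exception as e:
        logger.error(f"Error validating timestamp: {e}")
        return False
```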
### Retry Logic
```python
class RetryableDBOperation:
    """Wrapper for database operations with retry logic"""

    @staticmethod
    async def execute_with_retry(
        operation: Callable,
        max_retries: int = 3,
        backoff_seconds: float = 1.0
    ):
        """Execute database operation with exponential backoff retry"""
        for attempt in range(max_retries):
            try:
                return await operation()
            except Exception as e:
                if attempt == max_retries - 1:
                    logger.error(f"Operation failed after {max_retries} attempts: {e}")
                    raise
                wait_time = backoff_seconds * (2 ** attempt)
                logger.warning(f"Operation failed (attempt {attempt + 1}), retrying in {wait_time}s: {e}")
                await asyncio.sleep(wait_time)
```
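As a usage sketch (assuming `pipeline` is a `DataIngestionPipeline` instance), the batch flush could be wrapped so transient database failures are retried with backoff:

```python
await RetryableDBOperation.execute_with_retry(
    operation=pipeline._flush_ohlcv_buffer,  # coroutine function, awaited inside the wrapper
    max_retries=3,
    backoff_seconds=1.0,
)
```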
## Testing Strategy
### Unit Tests
1. **Data Validation Tests**
- Test OHLCV validation logic
- Test order book validation logic
- Test timestamp validation and timezone handling
2. **Cache Manager Tests**
- Test cache insertion and retrieval
- Test cache eviction logic
- Test cache hit/miss statistics
3. **Data Model Tests**
- Test InferenceDataFrame creation
- Test OrderBookDataFrame creation
- Test data serialization/deserialization
### Integration Tests
1. **Database Integration Tests**
- Test TimescaleDB connection and queries
- Test batch insert operations
- Test continuous aggregates
- Test compression and retention policies
2. **End-to-End Data Flow Tests**
- Test real-time data ingestion → cache → database
- Test historical data retrieval from database
- Test multi-timeframe data alignment
3. **Migration Tests**
- Test Parquet file migration
- Test data integrity after migration
- Test rollback capability
### Performance Tests
1. **Latency Tests**
- Cache read latency (<10ms target)
- Database query latency (<100ms target)
- Batch write throughput (>1000 ops/sec target)
2. **Load Tests**
- Concurrent read/write operations
- High-frequency data ingestion
- Large time-range queries
3. **Storage Tests**
- Compression ratio validation (>80% target)
- Storage growth over time
- Query performance with compressed data
## Performance Optimization
### Query Optimization
```sql
-- Use time_bucket for efficient time-range queries
SELECT
time_bucket('1 minute', timestamp) AS bucket,
symbol,
first(close_price, timestamp) AS price
FROM ohlcv_data
WHERE symbol = 'ETH/USDT'
AND timeframe = '1s'
AND timestamp >= NOW() - INTERVAL '1 hour'
GROUP BY bucket, symbol
ORDER BY bucket DESC;
-- Use indexes for symbol-based queries
CREATE INDEX CONCURRENTLY idx_ohlcv_symbol_tf_ts
ON ohlcv_data (symbol, timeframe, timestamp DESC);
```
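The same time_bucket query can be issued from application code through the connection pool. A minimal sketch using asyncpg (the function name and parameters are illustrative, not part of the design above):

```python
import asyncpg

async def fetch_1m_prices(pool: asyncpg.Pool, symbol: str = 'ETH/USDT'):
    """Fetch 1-minute price buckets for the last hour (illustrative sketch)."""
    query = """
        SELECT time_bucket('1 minute', timestamp) AS bucket,
               symbol,
               first(close_price, timestamp) AS price
        FROM ohlcv_data
        WHERE symbol = $1
          AND timeframe = '1s'
          AND timestamp >= NOW() - INTERVAL '1 hour'
        GROUP BY bucket, symbol
        ORDER BY bucket DESC;
    """
    async with pool.acquire() as conn:
        return await conn.fetch(query, symbol)
```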
### Caching Strategy
1. **Hot Data**: Last 5 minutes in memory (all symbols, all timeframes)
2. **Warm Data**: Last 1 hour in TimescaleDB uncompressed
3. **Cold Data**: Older than 1 hour in TimescaleDB compressed
### Batch Operations
- Batch size: 100 records or 5 seconds (whichever comes first)
- Use `executemany()` for bulk inserts
- Use `COPY` command for large migrations
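For the large-migration case, asyncpg exposes COPY through `copy_records_to_table()`. A sketch follows; the column order must match the target table, which is assumed here from the schema above. Note that COPY does not handle conflicts, so this path suits an initial backfill into an empty table rather than incremental upserts.

```python
import asyncpg

async def bulk_copy_ohlcv(pool: asyncpg.Pool, records) -> None:
    """Bulk-load OHLCV rows with COPY instead of row-by-row INSERTs (illustrative sketch).

    `records` is an iterable of tuples matching the listed columns.
    """
    async with pool.acquire() as conn:
        await conn.copy_records_to_table(
            'ohlcv_data',
            records=records,
            columns=[
                'timestamp', 'symbol', 'timeframe',
                'open_price', 'high_price', 'low_price', 'close_price',
                'volume', 'trade_count',
            ],
        )
```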
## Deployment Considerations
### Database Setup
1. Install TimescaleDB extension
2. Run schema creation scripts
3. Create hypertables and indexes
4. Set up continuous aggregates
5. Configure compression and retention policies
### Migration Process
1. **Phase 1**: Deploy new code with dual-write (Parquet + TimescaleDB)
2. **Phase 2**: Run migration script to backfill historical data
3. **Phase 3**: Verify data integrity
4. **Phase 4**: Switch reads to TimescaleDB
5. **Phase 5**: Deprecate Parquet writes
6. **Phase 6**: Archive old Parquet files
### Monitoring
1. **Database Metrics**
- Query latency (p50, p95, p99)
- Write throughput
- Storage size and compression ratio
- Connection pool utilization
2. **Cache Metrics**
- Hit/miss ratio
- Cache size
- Eviction rate
3. **Application Metrics**
- Data retrieval latency
- Error rates
- Data validation failures
## Security Considerations
1. **Database Access**
- Use connection pooling with proper credentials
- Implement read-only users for query-only operations
- Use SSL/TLS for database connections
2. **Data Validation**
- Validate all incoming data before storage
- Sanitize inputs to prevent SQL injection
- Implement rate limiting for API endpoints
3. **Backup and Recovery**
- Regular database backups (daily)
- Point-in-time recovery capability
- Disaster recovery plan
## Future Enhancements
1. **Multi-Exchange Support**
- Store data from multiple exchanges
- Cross-exchange arbitrage analysis
- Exchange-specific data normalization
2. **Advanced Analytics**
- Real-time pattern detection
- Anomaly detection
- Predictive analytics
3. **Distributed Storage**
- Horizontal scaling with TimescaleDB clustering
- Read replicas for query load distribution
- Geographic distribution for low-latency access


@@ -0,0 +1,134 @@
# Requirements Document
## Introduction
This feature aims to unify all data storage and retrieval methods across the trading system into a single, coherent interface. Currently, the system uses multiple storage approaches (Parquet files, pickle files, in-memory caches, TimescaleDB) and has fragmented data access patterns. This creates complexity, inconsistency, and performance issues.
The unified data storage system will provide a single endpoint for retrieving inference data, supporting both real-time streaming data and historical backtesting/annotation scenarios. It will consolidate storage methods into the most efficient approach and ensure all components use consistent data access patterns.
## Requirements
### Requirement 1: Unified Data Retrieval Interface
**User Story:** As a developer, I want a single method to retrieve inference data regardless of whether I need real-time or historical data, so that I can simplify my code and ensure consistency.
#### Acceptance Criteria
1. WHEN a component requests inference data THEN the system SHALL provide a unified `get_inference_data()` method that accepts a timestamp parameter
2. WHEN timestamp is None or "latest" THEN the system SHALL return the most recent cached real-time data
3. WHEN timestamp is a specific datetime THEN the system SHALL return historical data from local storage at that timestamp
4. WHEN requesting inference data THEN the system SHALL return data in a standardized format with all required features (OHLCV, technical indicators, COB data, order book imbalances)
5. WHEN the requested timestamp is not available THEN the system SHALL return the nearest available data point with a warning
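Criterion 5 maps naturally onto a timestamp-ordered lookup. A sketch of the underlying query, expressed with asyncpg against the `ohlcv_data` table from the design document (the helper itself is illustrative, not mandated by this requirement):

```python
import logging
from datetime import datetime

logger = logging.getLogger(__name__)

async def fetch_nearest_candle(conn, symbol: str, timeframe: str, target: datetime):
    """Return the candle at or immediately before `target`, warning if it is not exact."""
    row = await conn.fetchrow(
        """
        SELECT * FROM ohlcv_data
        WHERE symbol = $1 AND timeframe = $2 AND timestamp <= $3
        ORDER BY timestamp DESC
        LIMIT 1
        """,
        symbol, timeframe, target,
    )
    if row and row['timestamp'] != target:
        logger.warning(f"Exact timestamp {target} unavailable; returning {row['timestamp']}")
    return row
```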
### Requirement 2: Consolidated Storage Backend
**User Story:** As a system architect, I want all market data stored using a single, optimized storage method, so that I can reduce complexity and improve performance.
#### Acceptance Criteria
1. WHEN storing candlestick data THEN the system SHALL use TimescaleDB as the primary storage backend
2. WHEN storing raw order book ticks THEN the system SHALL use TimescaleDB with appropriate compression
3. WHEN storing aggregated 1s/1m data THEN the system SHALL use TimescaleDB hypertables for efficient time-series queries
4. WHEN the system starts THEN it SHALL migrate existing Parquet and pickle files to TimescaleDB
5. WHEN data is written THEN the system SHALL ensure atomic writes with proper error handling
6. WHEN querying data THEN the system SHALL leverage TimescaleDB's time-series optimizations for fast retrieval
### Requirement 3: Multi-Timeframe Data Storage
**User Story:** As a trading model, I need access to multiple timeframes (1s, 1m, 5m, 15m, 1h, 1d) of candlestick data, so that I can perform multi-timeframe analysis.
#### Acceptance Criteria
1. WHEN storing candlestick data THEN the system SHALL store all configured timeframes (1s, 1m, 5m, 15m, 1h, 1d)
2. WHEN aggregating data THEN the system SHALL use TimescaleDB continuous aggregates to automatically generate higher timeframes from 1s data
3. WHEN requesting multi-timeframe data THEN the system SHALL return aligned timestamps across all timeframes
4. WHEN a timeframe is missing data THEN the system SHALL generate it from lower timeframes if available
5. WHEN storing timeframe data THEN the system SHALL maintain at least 1500 candles per timeframe for each symbol
### Requirement 4: Raw Order Book and Trade Data Storage
**User Story:** As a machine learning model, I need access to raw 1s and 1m aggregated order book and trade book data, so that I can analyze market microstructure.
#### Acceptance Criteria
1. WHEN receiving order book updates THEN the system SHALL store raw ticks in TimescaleDB with full bid/ask depth
2. WHEN aggregating order book data THEN the system SHALL create 1s aggregations with $1 price buckets
3. WHEN aggregating order book data THEN the system SHALL create 1m aggregations with $10 price buckets
4. WHEN storing trade data THEN the system SHALL store individual trades with price, size, side, and timestamp
5. WHEN storing order book data THEN the system SHALL maintain 30 minutes of raw data and 24 hours of aggregated data
6. WHEN querying order book data THEN the system SHALL provide efficient access to imbalance metrics across multiple timeframes (1s, 5s, 15s, 60s)
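Criteria 2 and 3 reduce to a rounding step over raw levels. A small illustrative helper (not part of the requirement, names are assumptions) shows the bucketing:

```python
import math
from collections import defaultdict
from typing import Dict, List, Tuple

def bucket_order_book(bids: List[Tuple[float, float]],
                      asks: List[Tuple[float, float]],
                      bucket_size: float = 1.0) -> Dict[float, Dict[str, float]]:
    """Aggregate (price, size) levels into fixed-width price buckets (illustrative sketch)."""
    buckets: Dict[float, Dict[str, float]] = defaultdict(
        lambda: {'bid_volume': 0.0, 'ask_volume': 0.0}
    )
    for price, size in bids:
        buckets[math.floor(price / bucket_size) * bucket_size]['bid_volume'] += size
    for price, size in asks:
        buckets[math.floor(price / bucket_size) * bucket_size]['ask_volume'] += size
    return dict(buckets)

# The $10 buckets of criterion 3 are the same call with bucket_size=10.0.
```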
### Requirement 5: Real-Time Data Caching
**User Story:** As a real-time trading system, I need low-latency access to the latest market data, so that I can make timely trading decisions.
#### Acceptance Criteria
1. WHEN receiving real-time data THEN the system SHALL maintain an in-memory cache of the last 5 minutes of data
2. WHEN requesting latest data THEN the system SHALL serve from cache with <10ms latency
3. WHEN cache is updated THEN the system SHALL asynchronously persist to TimescaleDB without blocking
4. WHEN cache reaches capacity THEN the system SHALL evict oldest data while maintaining continuity
5. WHEN system restarts THEN the system SHALL rebuild cache from TimescaleDB automatically
### Requirement 6: Historical Data Access for Backtesting
**User Story:** As a backtesting system, I need efficient access to historical data at any timestamp, so that I can simulate trading strategies accurately.
#### Acceptance Criteria
1. WHEN requesting historical data THEN the system SHALL query TimescaleDB with timestamp-based indexing
2. WHEN requesting a time range THEN the system SHALL return all data points within that range efficiently
3. WHEN requesting data with context window THEN the system SHALL return ±N minutes of surrounding data
4. WHEN backtesting THEN the system SHALL support sequential data access without loading entire dataset into memory
5. WHEN querying historical data THEN the system SHALL return results in <100ms for typical queries (single timestamp, single symbol)
### Requirement 7: Data Annotation Support
**User Story:** As a data annotator, I need to retrieve historical market data at specific timestamps to manually label trading signals, so that I can create training datasets.
#### Acceptance Criteria
1. WHEN annotating data THEN the system SHALL provide the same `get_inference_data()` interface with timestamp parameter
2. WHEN retrieving annotation data THEN the system SHALL include ±5 minutes of context data
3. WHEN loading annotation sessions THEN the system SHALL support efficient random access to any timestamp
4. WHEN displaying charts THEN the system SHALL provide multi-timeframe data aligned to the annotation timestamp
5. WHEN saving annotations THEN the system SHALL link annotations to exact timestamps in the database
### Requirement 8: Data Migration and Backward Compatibility
**User Story:** As a system administrator, I want existing data migrated to the new storage system without data loss, so that I can maintain historical continuity.
#### Acceptance Criteria
1. WHEN migration starts THEN the system SHALL detect existing Parquet files in cache directory
2. WHEN migrating Parquet data THEN the system SHALL import all data into TimescaleDB with proper timestamps
3. WHEN migration completes THEN the system SHALL verify data integrity by comparing record counts
4. WHEN migration fails THEN the system SHALL rollback changes and preserve original files
5. WHEN migration succeeds THEN the system SHALL optionally archive old Parquet files
6. WHEN accessing data during migration THEN the system SHALL continue serving from existing storage
### Requirement 9: Performance and Scalability
**User Story:** As a system operator, I need the data storage system to handle high-frequency data ingestion and queries efficiently, so that the system remains responsive under load.
#### Acceptance Criteria
1. WHEN ingesting real-time data THEN the system SHALL handle at least 1000 updates per second per symbol
2. WHEN querying data THEN the system SHALL return single-timestamp queries in <100ms
3. WHEN querying time ranges THEN the system SHALL return 1 hour of 1s data in <500ms
4. WHEN storing data THEN the system SHALL use batch writes to optimize database performance
5. WHEN database grows THEN the system SHALL use TimescaleDB compression to reduce storage size by 80%+
6. WHEN running multiple queries THEN the system SHALL support concurrent access without performance degradation
### Requirement 10: Data Consistency and Validation
**User Story:** As a trading system, I need to ensure all data is consistent and validated, so that models receive accurate information.
#### Acceptance Criteria
1. WHEN storing data THEN the system SHALL validate timestamps are in UTC timezone
2. WHEN storing OHLCV data THEN the system SHALL validate high >= low and high >= open/close
3. WHEN storing order book data THEN the system SHALL validate bids < asks
4. WHEN detecting invalid data THEN the system SHALL log warnings and reject the data point
5. WHEN querying data THEN the system SHALL ensure all timeframes are properly aligned
6. WHEN data gaps exist THEN the system SHALL identify and log missing periods


@@ -0,0 +1,286 @@
# Implementation Plan
- [x] 1. Set up TimescaleDB schema and infrastructure
- Create database schema with hypertables for OHLCV, order book, and trade data
- Implement continuous aggregates for multi-timeframe data generation
- Configure compression and retention policies
- Create all necessary indexes for query optimization
- _Requirements: 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 3.1, 3.2, 3.3, 3.4, 3.5, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6_
- [ ] 2. Implement data models and validation
- [ ] 2.1 Create InferenceDataFrame and OrderBookDataFrame data classes
- Write dataclasses for standardized data structures
- Include all required fields (OHLCV, order book, imbalances, indicators)
- Add serialization/deserialization methods
- _Requirements: 1.4, 10.1, 10.2, 10.3_
- [ ] 2.2 Implement DataValidator class
- Write OHLCV validation logic (high >= low, positive volume)
- Write order book validation logic (bids < asks)
- Write timestamp validation and UTC timezone enforcement
- Add comprehensive error logging for validation failures
- _Requirements: 10.1, 10.2, 10.3, 10.4_
- [ ]* 2.3 Write unit tests for data models and validation
- Test InferenceDataFrame creation and serialization
- Test OrderBookDataFrame creation and serialization
- Test DataValidator with valid and invalid data
- Test edge cases and boundary conditions
- _Requirements: 10.1, 10.2, 10.3, 10.4_
- [ ] 3. Implement cache layer
- [ ] 3.1 Create DataCacheManager class
- Implement in-memory cache with deque structures
- Add methods for OHLCV, order book, and imbalance data
- Implement cache eviction logic (5-minute rolling window)
- Add cache statistics tracking (hits, misses)
- _Requirements: 5.1, 5.2, 5.3, 5.4_
- [ ] 3.2 Implement cache retrieval methods
- Write get_latest_ohlcv() with timeframe support
- Write get_latest_orderbook() for current snapshot
- Write get_latest_imbalances() for multi-timeframe metrics
- Ensure <10ms latency for cache reads
- _Requirements: 5.1, 5.2_
- [ ]* 3.3 Write unit tests for cache layer
- Test cache insertion and retrieval
- Test cache eviction logic
- Test cache statistics
- Test concurrent access patterns
- _Requirements: 5.1, 5.2, 5.3, 5.4_
- [ ] 4. Implement database connection and query layer
- [ ] 4.1 Create DatabaseConnectionManager class
- Implement asyncpg connection pool management
- Add health monitoring and automatic reconnection
- Configure connection pool settings (min/max connections)
- Add connection statistics and logging
- _Requirements: 2.1, 2.5, 9.6_
- [ ] 4.2 Implement OHLCV query methods
- Write query_ohlcv_data() for single timeframe retrieval
- Write query_multi_timeframe_ohlcv() for aligned multi-timeframe data
- Optimize queries with time_bucket and proper indexes
- Ensure <100ms query latency for typical queries
- _Requirements: 3.1, 3.2, 3.3, 3.4, 6.1, 6.2, 6.5, 9.2, 9.3_
- [ ] 4.3 Implement order book query methods
- Write query_orderbook_snapshots() for raw order book data
- Write query_orderbook_aggregated() for 1s/1m aggregations
- Write query_orderbook_imbalances() for multi-timeframe imbalances
- Optimize queries for fast retrieval
- _Requirements: 4.1, 4.2, 4.3, 4.6, 6.1, 6.2, 6.5_
- [ ]* 4.4 Write integration tests for database layer
- Test connection pool management
- Test OHLCV queries with various time ranges
- Test order book queries
- Test query performance and latency
- _Requirements: 6.1, 6.2, 6.5, 9.2, 9.3_
- [ ] 5. Implement data ingestion pipeline
- [ ] 5.1 Create DataIngestionPipeline class
- Implement batch write buffers for OHLCV, order book, and trade data
- Add batch size and timeout configuration
- Implement async batch flush methods
- Add error handling and retry logic
- _Requirements: 2.5, 5.3, 9.1, 9.4_
- [ ] 5.2 Implement OHLCV ingestion
- Write ingest_ohlcv_candle() method
- Add immediate cache write
- Implement batch buffering for database writes
- Add data validation before ingestion
- _Requirements: 2.1, 2.2, 2.5, 5.1, 5.3, 9.1, 9.4, 10.1, 10.2_
- [ ] 5.3 Implement order book ingestion
- Write ingest_orderbook_snapshot() method
- Calculate and cache imbalance metrics
- Implement batch buffering for database writes
- Add data validation before ingestion
- _Requirements: 2.1, 2.2, 4.1, 4.2, 4.3, 5.1, 5.3, 9.1, 9.4, 10.3_
- [ ] 5.4 Implement retry logic and error handling
- Create RetryableDBOperation wrapper class
- Implement exponential backoff retry strategy
- Add comprehensive error logging
- Handle database connection failures gracefully
- _Requirements: 2.5, 9.6_
- [ ]* 5.5 Write integration tests for ingestion pipeline
- Test OHLCV ingestion flow (cache → database)
- Test order book ingestion flow
- Test batch write operations
- Test error handling and retry logic
- _Requirements: 2.5, 5.3, 9.1, 9.4_
- [ ] 6. Implement unified data provider API
- [ ] 6.1 Create UnifiedDataProvider class
- Initialize with database connection pool and cache manager
- Configure symbols and timeframes
- Add connection to existing DataProvider components
- _Requirements: 1.1, 1.2, 1.3_
- [ ] 6.2 Implement get_inference_data() method
- Handle timestamp=None for real-time data from cache
- Handle specific timestamp for historical data from database
- Implement context window retrieval (±N minutes)
- Combine OHLCV, order book, and imbalance data
- Return standardized InferenceDataFrame
- _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 5.2, 6.1, 6.2, 6.3, 6.4, 7.1, 7.2, 7.3_
- [ ] 6.3 Implement get_multi_timeframe_data() method
- Query multiple timeframes efficiently
- Align timestamps across timeframes
- Handle missing data by generating from lower timeframes
- Return dictionary mapping timeframe to DataFrame
- _Requirements: 3.1, 3.2, 3.3, 3.4, 6.1, 6.2, 6.3, 10.5_
- [ ] 6.4 Implement get_order_book_data() method
- Handle different aggregation levels (raw, 1s, 1m)
- Include multi-timeframe imbalance metrics
- Return standardized OrderBookDataFrame
- _Requirements: 4.1, 4.2, 4.3, 4.6, 6.1, 6.2_
- [ ]* 6.5 Write integration tests for unified API
- Test get_inference_data() with real-time and historical data
- Test get_multi_timeframe_data() with various timeframes
- Test get_order_book_data() with different aggregations
- Test context window retrieval
- Test data consistency across methods
- _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 6.1, 6.2, 6.3, 6.4, 10.5, 10.6_
- [ ] 7. Implement data migration system
- [ ] 7.1 Create DataMigrationManager class
- Initialize with database connection and cache directory path
- Add methods for discovering existing Parquet files
- Implement symbol format conversion utilities
- _Requirements: 8.1, 8.2, 8.6_
- [ ] 7.2 Implement Parquet file migration
- Write _migrate_ohlcv_data() to process all Parquet files
- Parse filenames to extract symbol and timeframe
- Read Parquet files and convert to database format
- Implement batch insertion with conflict handling
- _Requirements: 8.1, 8.2, 8.3, 8.5_
- [ ] 7.3 Implement migration verification
- Write _verify_migration() to compare record counts
- Check data integrity (no missing timestamps)
- Validate data ranges match original files
- Generate migration report
- _Requirements: 8.3, 8.4_
- [ ] 7.4 Implement rollback capability
- Add transaction support for migration operations
- Implement rollback on verification failure
- Preserve original Parquet files until verification passes
- Add option to archive old files after successful migration
- _Requirements: 8.4, 8.5_
- [ ]* 7.5 Write integration tests for migration
- Test Parquet file discovery and parsing
- Test data migration with sample files
- Test verification logic
- Test rollback on failure
- _Requirements: 8.1, 8.2, 8.3, 8.4_
- [ ] 8. Integrate with existing DataProvider
- [ ] 8.1 Update DataProvider class to use UnifiedDataProvider
- Replace existing data retrieval methods with unified API calls
- Update get_data() method to use get_inference_data()
- Update multi-timeframe methods to use get_multi_timeframe_data()
- Maintain backward compatibility with existing interfaces
- _Requirements: 1.1, 1.2, 1.3, 8.6_
- [ ] 8.2 Update real-time data flow
- Connect WebSocket data to DataIngestionPipeline
- Update tick aggregator to write to cache and database
- Update COB integration to use new ingestion methods
- Ensure no data loss during transition
- _Requirements: 2.1, 2.2, 5.1, 5.3, 8.6_
- [ ] 8.3 Update annotation system integration
- Update ANNOTATE/core/data_loader.py to use unified API
- Ensure annotation system uses get_inference_data() with timestamps
- Test annotation workflow with new data provider
- _Requirements: 7.1, 7.2, 7.3, 7.4, 7.5_
- [ ] 8.4 Update backtesting system integration
- Update backtesting data access to use unified API
- Ensure sequential data access works efficiently
- Test backtesting performance with new data provider
- _Requirements: 6.1, 6.2, 6.3, 6.4, 6.5_
- [ ]* 8.5 Write end-to-end integration tests
- Test complete data flow: WebSocket ingestion → cache → database → retrieval
- Test annotation system with unified data provider
- Test backtesting system with unified data provider
- Test real-time trading with unified data provider
- _Requirements: 1.1, 1.2, 1.3, 6.1, 6.2, 7.1, 8.6_
- [ ] 9. Performance optimization and monitoring
- [ ] 9.1 Implement performance monitoring
- Add latency tracking for cache reads (<10ms target)
- Add latency tracking for database queries (<100ms target)
- Add throughput monitoring for ingestion (>1000 ops/sec target)
- Create performance dashboard or logging
- _Requirements: 5.2, 6.5, 9.1, 9.2, 9.3_
- [ ] 9.2 Optimize database queries
- Analyze query execution plans
- Add missing indexes if needed
- Optimize time_bucket usage
- Implement query result caching where appropriate
- _Requirements: 6.5, 9.2, 9.3, 9.6_
- [ ] 9.3 Implement compression and retention
- Verify compression policies are working (>80% compression target)
- Monitor storage growth over time
- Verify retention policies are cleaning old data
- Add alerts for storage issues
- _Requirements: 2.6, 9.5_
- [ ]* 9.4 Write performance tests
- Test cache read latency under load
- Test database query latency with various time ranges
- Test ingestion throughput with high-frequency data
- Test concurrent access patterns
- _Requirements: 5.2, 6.5, 9.1, 9.2, 9.3, 9.6_
- [ ] 10. Documentation and deployment
- [ ] 10.1 Create deployment documentation
- Document TimescaleDB setup and configuration
- Document migration process and steps
- Document rollback procedures
- Create troubleshooting guide
- _Requirements: 8.1, 8.2, 8.3, 8.4, 8.5, 8.6_
- [ ] 10.2 Create API documentation
- Document UnifiedDataProvider API methods
- Provide usage examples for each method
- Document data models and structures
- Create migration guide for existing code
- _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5_
- [ ] 10.3 Create monitoring and alerting setup
- Document key metrics to monitor
- Set up alerts for performance degradation
- Set up alerts for data validation failures
- Create operational runbook
- _Requirements: 9.1, 9.2, 9.3, 9.5, 9.6, 10.4_
- [ ] 10.4 Execute phased deployment
- Phase 1: Deploy with dual-write (Parquet + TimescaleDB)
- Phase 2: Run migration script for historical data
- Phase 3: Verify data integrity
- Phase 4: Switch reads to TimescaleDB
- Phase 5: Deprecate Parquet writes
- Phase 6: Archive old Parquet files
- _Requirements: 8.1, 8.2, 8.3, 8.4, 8.5, 8.6_


@@ -0,0 +1,285 @@
# Real-Time Inference Guide
## 🎯 Overview
Real-time inference mode runs your trained model on **live streaming data** from the DataProvider, updating charts every second and displaying model predictions in real-time.
---
## 🚀 Starting Real-Time Inference
### Step 1: Select Model
Choose the model you want to run from the dropdown in the training panel.
### Step 2: Click "Start Live Inference"
- Button turns red: "Stop Inference"
- Live mode banner appears at top
- Charts begin updating every second
- Model predictions displayed
### Visual Indicators
- **🔴 LIVE banner** at top of page
- **Green status box** in training panel
- **Update counter** showing number of updates
- **Signal markers** on charts (🔵 BUY, 🔴 SELL)
---
## 📊 What Updates in Real-Time
### Charts (Every 1 Second)
- **All 4 timeframes** update with latest data
- **Candlesticks** show new price action
- **Volume bars** update with new volume
- **Smooth updates** without page refresh
### Model Signals
- **Latest prediction** displayed (BUY/SELL/HOLD)
- **Confidence level** shown as percentage
- **Signal markers** added to charts
- **Last 10 signals** kept visible
### Data Source
- Uses **DataProvider's cached data**
- Same data as main trading system
- Updates from exchange feeds
- 1-second resolution
---
## 🎨 Visual Elements
### Live Mode Banner
```
🔴 LIVE | Real-Time Inference Active
Charts updating with live data every second
[X updates]
```
### Signal Markers on Charts
- **🔵 BUY** - Green marker with arrow
- **🔴 SELL** - Red marker with arrow
- **Timestamp** - When signal was generated
- **Price** - Price at signal time
### Training Panel Status
```
🔴 LIVE
Signal: BUY
Confidence: 75.3%
Charts updating every 1s
```
---
## 🛑 Stopping Real-Time Inference
### Click "Stop Inference"
- Live mode banner disappears
- Charts stop updating
- Signal markers remain visible
- Can review final signals
### What Happens
- Inference loop terminates
- Chart updates stop
- Last 100 signals saved
- Model remains loaded
---
## 📈 Monitoring Performance
### Watch For
- **Signal frequency** - How often model signals
- **Confidence levels** - Higher is better (>70%)
- **Signal accuracy** - Do signals make sense?
- **False positives** - Signals that shouldn't happen
### Good Signs
- ✅ Signals at key levels (support/resistance)
- ✅ High confidence (>70%)
- ✅ Signals match your analysis
- ✅ Few false positives
### Warning Signs
- ⚠️ Too many signals (every second)
- ⚠️ Low confidence (<50%)
- ⚠️ Random signals
- ⚠️ Signals don't match patterns
---
## 🔧 Technical Details
### Update Frequency
- **Charts**: 1 second
- **Signals**: 1 second
- **Model inference**: 1 second
### Data Flow
```
DataProvider (Live Data)
          ↓
Latest Market State (4 timeframes)
          ↓
Model Inference
          ↓
Prediction (Action + Confidence)
          ↓
Update Charts + Display Signal
```
### Performance
- **Latency**: ~100-200ms per update
- **CPU Usage**: Moderate (model inference)
- **Memory**: Stable (no leaks)
- **Network**: Minimal (uses cached data)
---
## 💡 Tips & Tricks
### Tip 1: Watch Multiple Timeframes
All 4 charts update simultaneously. Watch for:
- Alignment across timeframes
- Divergences between timeframes
- Pattern confirmation
### Tip 2: Monitor Confidence
- **>80%**: Very strong signal
- **70-80%**: Strong signal
- **50-70%**: Moderate signal
- **<50%**: Weak signal (ignore)
### Tip 3: Compare with Annotations
- Do live signals match your annotations?
- Are signals at similar price levels?
- Is timing similar to your trades?
### Tip 4: Test Different Models
- Try CNN vs DQN vs Transformer
- Compare signal quality
- Note which performs best
### Tip 5: Use for Validation
- After training, test with live inference
- Verify model learned correctly
- Check for overfitting
---
## 🐛 Troubleshooting
### Charts Not Updating
**Issue**: Live mode active but charts frozen
**Solutions**:
- Check browser console for errors
- Verify DataProvider has live data
- Refresh page and restart inference
- Check network tab for failed requests
### No Signals Generated
**Issue**: Status shows "HOLD" constantly
**Solutions**:
- Model may need more training
- Check model is loaded correctly
- Verify market conditions (model may correctly hold)
- Try different model
### Signals Too Frequent
**Issue**: Signal every second
**Solutions**:
- Model may be overtrained
- Need more negative examples in training
- Adjust confidence threshold
- Retrain with better annotations
### Performance Issues
**Issue**: Browser slow/laggy
**Solutions**:
- Close other tabs
- Reduce number of visible timeframes
- Stop inference when not needed
- Clear browser cache
---
## 📊 Example Session
### Scenario: Testing CNN After Training
**1. Preparation**
- Trained CNN on 20 breakout annotations
- Model learned breakout patterns
- Ready to test on live data
**2. Start Inference**
- Select "StandardizedCNN"
- Click "Start Live Inference"
- 🔴 LIVE banner appears
- Charts begin updating
**3. Observation (5 minutes)**
- Charts update smoothly
- Model generates 2 BUY signals
- Both at resistance breakouts
- Confidence: 78% and 82%
**4. Validation**
- Signals match training patterns
- Timing is precise
- No false positives
- Model learned correctly
**5. Stop Inference**
- Click "Stop Inference"
- Review signal history
- Model performs well
- Ready for production
---
## 🎯 Best Practices
### Before Starting
- Train model first
- Verify model loaded
- Check DataProvider has data
- Close unnecessary tabs
### During Inference
- Monitor all timeframes
- Note signal quality
- Check confidence levels
- Compare with your analysis
### After Stopping
- Review signal history
- Note performance
- Identify improvements
- Adjust training if needed
---
## 🚀 Summary
Real-time inference provides:
- ✅ **Live chart updates** (1/second)
- ✅ **Model predictions** in real-time
- ✅ **Signal markers** on charts
- ✅ **Confidence levels** displayed
- ✅ **Performance monitoring** built-in
Use it to:
- **Validate training** - Check model learned correctly
- **Test models** - Compare different models
- **Monitor performance** - Track signal quality
- **Debug issues** - Identify problems
**Result**: Confidence that your model works correctly before deploying to production! 🎯


@@ -13,7 +13,7 @@
<option value="">Loading models...</option>
</select>
</div>
<!-- Training Controls -->
<div class="mb-3">
<button class="btn btn-primary btn-sm w-100" id="train-model-btn">
@@ -21,7 +21,7 @@
Train Model
</button>
</div>
<!-- Training Status -->
<div id="training-status" style="display: none;">
<div class="alert alert-info py-2 px-2 mb-2">
@@ -32,10 +32,8 @@
<strong class="small">Training in progress</strong>
</div>
<div class="progress mb-1" style="height: 10px;">
<div class="progress-bar progress-bar-striped progress-bar-animated"
id="training-progress-bar"
role="progressbar"
style="width: 0%"></div>
<div class="progress-bar progress-bar-striped progress-bar-animated" id="training-progress-bar"
role="progressbar" style="width: 0%"></div>
</div>
<div class="small">
<div>Epoch: <span id="training-epoch">0</span>/<span id="training-total-epochs">0</span></div>
@@ -43,7 +41,7 @@
</div>
</div>
</div>
<!-- Training Results -->
<div id="training-results" style="display: none;">
<div class="alert alert-success py-2 px-2 mb-2">
@@ -58,7 +56,7 @@
</div>
</div>
</div>
<!-- Real-Time Inference -->
<div class="mb-3">
<label class="form-label small">Real-Time Inference</label>
@@ -71,7 +69,7 @@
Stop Inference
</button>
</div>
<!-- Inference Status -->
<div id="inference-status" style="display: none;">
<div class="alert alert-success py-2 px-2 mb-2">
@@ -88,7 +86,7 @@
</div>
</div>
</div>
<!-- Test Case Stats -->
<div class="small text-muted">
<div class="d-flex justify-content-between">
@@ -111,7 +109,7 @@
.then(data => {
const modelSelect = document.getElementById('model-select');
modelSelect.innerHTML = '';
if (data.success && data.models.length > 0) {
data.models.forEach(model => {
const option = document.createElement('option');
@@ -132,202 +130,214 @@
modelSelect.innerHTML = '<option value="">Error loading models</option>';
});
}
// Load models when page loads
if (document.readyState === 'loading') {
document.addEventListener('DOMContentLoaded', loadAvailableModels);
} else {
loadAvailableModels();
}
// Train model button
document.getElementById('train-model-btn').addEventListener('click', function() {
document.getElementById('train-model-btn').addEventListener('click', function () {
const modelName = document.getElementById('model-select').value;
if (appState.annotations.length === 0) {
showError('No annotations available for training');
return;
}
// Get annotation IDs
const annotationIds = appState.annotations.map(a => a.annotation_id);
// Start training
startTraining(modelName, annotationIds);
});
function startTraining(modelName, annotationIds) {
// Show training status
document.getElementById('training-status').style.display = 'block';
document.getElementById('training-results').style.display = 'none';
document.getElementById('train-model-btn').disabled = true;
// Reset progress
document.getElementById('training-progress-bar').style.width = '0%';
document.getElementById('training-epoch').textContent = '0';
document.getElementById('training-loss').textContent = '--';
// Start training request
fetch('/api/train-model', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model_name: modelName,
annotation_ids: annotationIds
})
})
.then(response => response.json())
.then(data => {
if (data.success) {
// Start polling for training progress
pollTrainingProgress(data.training_id);
} else {
showError('Failed to start training: ' + data.error.message);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;
}
})
.catch(error => {
showError('Network error: ' + error.message);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;
});
}
function pollTrainingProgress(trainingId) {
const pollInterval = setInterval(function() {
fetch('/api/training-progress', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({training_id: trainingId})
})
.then(response => response.json())
.then(data => {
if (data.success) {
const progress = data.progress;
// Update progress bar
const percentage = (progress.current_epoch / progress.total_epochs) * 100;
document.getElementById('training-progress-bar').style.width = percentage + '%';
document.getElementById('training-epoch').textContent = progress.current_epoch;
document.getElementById('training-total-epochs').textContent = progress.total_epochs;
document.getElementById('training-loss').textContent = progress.current_loss.toFixed(4);
// Check if complete
if (progress.status === 'completed') {
clearInterval(pollInterval);
showTrainingResults(progress);
} else if (progress.status === 'failed') {
clearInterval(pollInterval);
showError('Training failed: ' + progress.error);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;
}
// Start polling for training progress
pollTrainingProgress(data.training_id);
} else {
showError('Failed to start training: ' + data.error.message);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;
}
})
.catch(error => {
clearInterval(pollInterval);
showError('Failed to get training progress: ' + error.message);
showError('Network error: ' + error.message);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;
});
}
function pollTrainingProgress(trainingId) {
const pollInterval = setInterval(function () {
fetch('/api/training-progress', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ training_id: trainingId })
})
.then(response => response.json())
.then(data => {
if (data.success) {
const progress = data.progress;
// Update progress bar
const percentage = (progress.current_epoch / progress.total_epochs) * 100;
document.getElementById('training-progress-bar').style.width = percentage + '%';
document.getElementById('training-epoch').textContent = progress.current_epoch;
document.getElementById('training-total-epochs').textContent = progress.total_epochs;
document.getElementById('training-loss').textContent = progress.current_loss.toFixed(4);
// Check if complete
if (progress.status === 'completed') {
clearInterval(pollInterval);
showTrainingResults(progress);
} else if (progress.status === 'failed') {
clearInterval(pollInterval);
showError('Training failed: ' + progress.error);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;
}
}
})
.catch(error => {
clearInterval(pollInterval);
showError('Failed to get training progress: ' + error.message);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;
});
}, 1000); // Poll every second
}
function showTrainingResults(results) {
// Hide training status
document.getElementById('training-status').style.display = 'none';
// Show results
document.getElementById('training-results').style.display = 'block';
document.getElementById('result-loss').textContent = results.final_loss.toFixed(4);
document.getElementById('result-accuracy').textContent = (results.accuracy * 100).toFixed(2) + '%';
document.getElementById('result-duration').textContent = results.duration_seconds.toFixed(1) + 's';
// Update last training time
document.getElementById('last-training-time').textContent = new Date().toLocaleTimeString();
// Re-enable train button
document.getElementById('train-model-btn').disabled = false;
showSuccess('Training completed successfully');
}
// Real-time inference controls
let currentInferenceId = null;
let signalPollInterval = null;
document.getElementById('start-inference-btn').addEventListener('click', function() {
document.getElementById('start-inference-btn').addEventListener('click', function () {
const modelName = document.getElementById('model-select').value;
if (!modelName) {
showError('Please select a model first');
return;
}
// Start real-time inference
fetch('/api/realtime-inference/start', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model_name: modelName,
symbol: appState.currentSymbol
})
})
.then(response => response.json())
.then(data => {
if (data.success) {
currentInferenceId = data.inference_id;
// Update UI
document.getElementById('start-inference-btn').style.display = 'none';
document.getElementById('stop-inference-btn').style.display = 'block';
document.getElementById('inference-status').style.display = 'block';
// Show live mode banner
const banner = document.getElementById('live-mode-banner');
if (banner) {
banner.style.display = 'block';
}
// Start polling for signals
startSignalPolling();
showSuccess('Real-time inference started - Charts now updating live');
} else {
showError('Failed to start inference: ' + data.error.message);
}
})
.catch(error => {
showError('Network error: ' + error.message);
});
});
document.getElementById('stop-inference-btn').addEventListener('click', function () {
if (!currentInferenceId) return;
// Stop real-time inference
fetch('/api/realtime-inference/stop', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ inference_id: currentInferenceId })
})
.then(response => response.json())
.then(data => {
if (data.success) {
// Update UI
document.getElementById('start-inference-btn').style.display = 'block';
document.getElementById('stop-inference-btn').style.display = 'none';
document.getElementById('inference-status').style.display = 'none';
// Hide live mode banner
const banner = document.getElementById('live-mode-banner');
if (banner) {
banner.style.display = 'none';
}
// Stop polling
stopSignalPolling();
currentInferenceId = null;
showSuccess('Real-time inference stopped');
}
})
.catch(error => {
showError('Network error: ' + error.message);
});
});
function startSignalPolling() {
signalPollInterval = setInterval(function () {
// Poll for signals
fetch('/api/realtime-inference/signals')
.then(response => response.json())
@@ -335,9 +345,9 @@
if (data.success && data.signals.length > 0) {
const latest = data.signals[0];
document.getElementById('latest-signal').textContent = latest.action;
document.getElementById('latest-confidence').textContent =
(latest.confidence * 100).toFixed(1) + '%';
// Update chart with signal markers
if (appState.chartManager) {
displaySignalOnChart(latest);
@@ -347,17 +357,17 @@
.catch(error => {
console.error('Error polling signals:', error);
});
// Update charts with latest data
updateChartsWithLiveData();
}, 1000); // Poll every second
}
function updateChartsWithLiveData() {
// Fetch latest chart data
fetch('/api/chart-data', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
symbol: appState.currentSymbol,
timeframes: appState.currentTimeframes,
@@ -365,72 +375,85 @@
end_time: null
})
})
.then(response => response.json())
.then(data => {
if (data.success && appState.chartManager) {
// Update each chart with new data
Object.keys(data.chart_data).forEach(timeframe => {
const chartData = data.chart_data[timeframe];
if (appState.chartManager.charts[timeframe]) {
updateSingleChart(timeframe, chartData);
}
});
}
})
.catch(error => {
console.error('Error updating charts:', error);
});
}
let liveUpdateCount = 0;
function updateSingleChart(timeframe, newData) {
const chart = appState.chartManager.charts[timeframe];
if (!chart) return;
try {
// Update candlestick data
Plotly.update(chart.plotId, {
x: [newData.timestamps],
open: [newData.open],
high: [newData.high],
low: [newData.low],
close: [newData.close]
}, {}, [0]);
// Update volume data
const volumeColors = newData.close.map((close, i) => {
if (i === 0) return '#3b82f6';
return close >= newData.open[i] ? '#10b981' : '#ef4444';
});
Plotly.update(chart.plotId, {
x: [newData.timestamps],
y: [newData.volume],
'marker.color': [volumeColors]
}, {}, [1]);
// Update counter
liveUpdateCount++;
const counterEl = document.getElementById('live-update-count');
if (counterEl) {
counterEl.textContent = liveUpdateCount + ' updates';
}
} catch (error) {
console.error('Error updating chart:', timeframe, error);
}
}
function stopSignalPolling() {
if (signalPollInterval) {
clearInterval(signalPollInterval);
signalPollInterval = null;
}
}
function displaySignalOnChart(signal) {
// Add signal marker to chart
if (!appState.chartManager || !appState.chartManager.charts) return;
// Add marker to all timeframe charts
Object.keys(appState.chartManager.charts).forEach(timeframe => {
const chart = appState.chartManager.charts[timeframe];
if (!chart) return;
// Get current annotations
const currentAnnotations = chart.element.layout.annotations || [];
// Determine marker based on signal
let markerText = '';
let markerColor = '#9ca3af';
if (signal.action === 'BUY') {
markerText = '🔵 BUY';
markerColor = '#10b981';
@@ -440,7 +463,7 @@
} else {
return; // Don't show HOLD signals
}
// Add new signal marker
const newAnnotation = {
x: signal.timestamp,
@@ -460,25 +483,25 @@
borderpad: 4,
opacity: 0.8
};
// Keep only last 10 signal markers
const signalAnnotations = currentAnnotations.filter(ann =>
ann.text && (ann.text.includes('BUY') || ann.text.includes('SELL'))
).slice(-9);
// Combine with existing non-signal annotations
const otherAnnotations = currentAnnotations.filter(ann =>
!ann.text || (!ann.text.includes('BUY') && !ann.text.includes('SELL'))
);
const allAnnotations = [...otherAnnotations, ...signalAnnotations, newAnnotation];
// Update chart
Plotly.relayout(chart.plotId, {
annotations: allAnnotations
});
});
console.log('Signal displayed:', signal.action, '@', signal.price);
}
</script>

View File

@@ -0,0 +1,553 @@
"""
Unified Data Storage Schema Manager
Extends COBY schema with additional tables for unified data storage system.
"""
import logging
from typing import Dict, List, Optional
from datetime import datetime
import asyncpg
logger = logging.getLogger(__name__)
class UnifiedStorageSchemaManager:
"""
Manages unified data storage schema with TimescaleDB.
Extends existing COBY schema with additional tables for:
- Enhanced OHLCV with technical indicators
- Order book 1s aggregations with price buckets
- Multi-timeframe order book imbalances
"""
def __init__(self, connection_pool):
self.pool = connection_pool
self.current_version = "2.0.0"
async def initialize_schema_tracking(self) -> None:
"""Initialize schema version tracking table."""
try:
async with self.pool.acquire() as conn:
await conn.execute("""
CREATE TABLE IF NOT EXISTS schema_migrations (
version VARCHAR(20) PRIMARY KEY,
applied_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
description TEXT,
checksum VARCHAR(64)
);
""")
logger.info("Schema tracking initialized")
except Exception as e:
logger.error(f"Failed to initialize schema tracking: {e}")
raise
async def apply_migration(self, version: str, description: str, sql_commands: List[str]) -> bool:
"""Apply a database migration."""
try:
async with self.pool.acquire() as conn:
async with conn.transaction():
# Check if migration already applied
existing = await conn.fetchval("""
SELECT version FROM schema_migrations WHERE version = $1
""", version)
if existing:
logger.info(f"Migration {version} already applied")
return True
# Apply migration commands
for sql_command in sql_commands:
try:
await conn.execute(sql_command)
except Exception as cmd_error:
logger.error(f"Error executing command: {sql_command[:100]}... Error: {cmd_error}")
raise
# Record migration
await conn.execute("""
INSERT INTO schema_migrations (version, description)
VALUES ($1, $2)
""", version, description)
logger.info(f"Applied migration {version}: {description}")
return True
except Exception as e:
logger.error(f"Failed to apply migration {version}: {e}")
return False
async def create_enhanced_ohlcv_table(self) -> bool:
"""Create enhanced OHLCV table with technical indicators."""
migration_commands = [
"""
CREATE TABLE IF NOT EXISTS ohlcv_data (
timestamp TIMESTAMPTZ NOT NULL,
symbol VARCHAR(20) NOT NULL,
timeframe VARCHAR(10) NOT NULL,
open_price DECIMAL(20,8) NOT NULL,
high_price DECIMAL(20,8) NOT NULL,
low_price DECIMAL(20,8) NOT NULL,
close_price DECIMAL(20,8) NOT NULL,
volume DECIMAL(30,8) NOT NULL,
trade_count INTEGER DEFAULT 0,
-- Technical Indicators (pre-calculated)
rsi_14 DECIMAL(10,4),
macd DECIMAL(20,8),
macd_signal DECIMAL(20,8),
macd_histogram DECIMAL(20,8),
bb_upper DECIMAL(20,8),
bb_middle DECIMAL(20,8),
bb_lower DECIMAL(20,8),
ema_12 DECIMAL(20,8),
ema_26 DECIMAL(20,8),
sma_20 DECIMAL(20,8),
PRIMARY KEY (timestamp, symbol, timeframe)
);
"""
]
return await self.apply_migration(
"2.0.0",
"Create enhanced OHLCV table with technical indicators",
migration_commands
)
async def create_order_book_tables(self) -> bool:
"""Create order book related tables."""
migration_commands = [
# Order book snapshots
"""
CREATE TABLE IF NOT EXISTS order_book_snapshots (
timestamp TIMESTAMPTZ NOT NULL,
symbol VARCHAR(20) NOT NULL,
exchange VARCHAR(20) NOT NULL DEFAULT 'binance',
bids JSONB NOT NULL,
asks JSONB NOT NULL,
mid_price DECIMAL(20,8),
spread DECIMAL(20,8),
bid_volume DECIMAL(30,8),
ask_volume DECIMAL(30,8),
sequence_id BIGINT,
PRIMARY KEY (timestamp, symbol, exchange)
);
""",
# Order book 1s aggregations with price buckets
"""
CREATE TABLE IF NOT EXISTS order_book_1s_agg (
timestamp TIMESTAMPTZ NOT NULL,
symbol VARCHAR(20) NOT NULL,
price_bucket DECIMAL(20,2) NOT NULL,
bid_volume DECIMAL(30,8) DEFAULT 0,
ask_volume DECIMAL(30,8) DEFAULT 0,
bid_count INTEGER DEFAULT 0,
ask_count INTEGER DEFAULT 0,
imbalance DECIMAL(10,6) DEFAULT 0,
PRIMARY KEY (timestamp, symbol, price_bucket)
);
""",
# Multi-timeframe order book imbalances
"""
CREATE TABLE IF NOT EXISTS order_book_imbalances (
timestamp TIMESTAMPTZ NOT NULL,
symbol VARCHAR(20) NOT NULL,
imbalance_1s DECIMAL(10,6) DEFAULT 0,
imbalance_5s DECIMAL(10,6) DEFAULT 0,
imbalance_15s DECIMAL(10,6) DEFAULT 0,
imbalance_60s DECIMAL(10,6) DEFAULT 0,
volume_imbalance_1s DECIMAL(10,6) DEFAULT 0,
volume_imbalance_5s DECIMAL(10,6) DEFAULT 0,
volume_imbalance_15s DECIMAL(10,6) DEFAULT 0,
volume_imbalance_60s DECIMAL(10,6) DEFAULT 0,
price_range DECIMAL(10,2),
PRIMARY KEY (timestamp, symbol)
);
"""
]
return await self.apply_migration(
"2.0.1",
"Create order book tables (snapshots, 1s aggregations, imbalances)",
migration_commands
)
async def create_trade_events_table(self) -> bool:
"""Create trade events table."""
migration_commands = [
"""
CREATE TABLE IF NOT EXISTS trade_events (
timestamp TIMESTAMPTZ NOT NULL,
symbol VARCHAR(20) NOT NULL,
exchange VARCHAR(20) NOT NULL DEFAULT 'binance',
price DECIMAL(20,8) NOT NULL,
size DECIMAL(30,8) NOT NULL,
side VARCHAR(4) NOT NULL,
trade_id VARCHAR(100) NOT NULL,
is_buyer_maker BOOLEAN,
PRIMARY KEY (timestamp, symbol, exchange, trade_id)
);
"""
]
return await self.apply_migration(
"2.0.2",
"Create trade events table",
migration_commands
)
async def create_hypertables(self) -> bool:
"""Convert tables to TimescaleDB hypertables."""
hypertable_commands = [
"SELECT create_hypertable('ohlcv_data', 'timestamp', if_not_exists => TRUE);",
"SELECT create_hypertable('order_book_snapshots', 'timestamp', if_not_exists => TRUE);",
"SELECT create_hypertable('order_book_1s_agg', 'timestamp', if_not_exists => TRUE);",
"SELECT create_hypertable('order_book_imbalances', 'timestamp', if_not_exists => TRUE);",
"SELECT create_hypertable('trade_events', 'timestamp', if_not_exists => TRUE);"
]
return await self.apply_migration(
"2.0.3",
"Convert tables to TimescaleDB hypertables",
hypertable_commands
)
async def create_indexes(self) -> bool:
"""Create performance indexes."""
index_commands = [
# OHLCV indexes
"CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_tf_ts ON ohlcv_data (symbol, timeframe, timestamp DESC);",
"CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_ts ON ohlcv_data (symbol, timestamp DESC);",
"CREATE INDEX IF NOT EXISTS idx_ohlcv_tf_ts ON ohlcv_data (timeframe, timestamp DESC);",
# Order book snapshots indexes
"CREATE INDEX IF NOT EXISTS idx_obs_symbol_ts ON order_book_snapshots (symbol, timestamp DESC);",
"CREATE INDEX IF NOT EXISTS idx_obs_exchange_ts ON order_book_snapshots (exchange, timestamp DESC);",
"CREATE INDEX IF NOT EXISTS idx_obs_symbol_exchange_ts ON order_book_snapshots (symbol, exchange, timestamp DESC);",
# Order book 1s aggregation indexes
"CREATE INDEX IF NOT EXISTS idx_ob1s_symbol_ts ON order_book_1s_agg (symbol, timestamp DESC);",
"CREATE INDEX IF NOT EXISTS idx_ob1s_symbol_bucket_ts ON order_book_1s_agg (symbol, price_bucket, timestamp DESC);",
# Order book imbalances indexes
"CREATE INDEX IF NOT EXISTS idx_obi_symbol_ts ON order_book_imbalances (symbol, timestamp DESC);",
# Trade events indexes
"CREATE INDEX IF NOT EXISTS idx_trades_symbol_ts ON trade_events (symbol, timestamp DESC);",
"CREATE INDEX IF NOT EXISTS idx_trades_exchange_ts ON trade_events (exchange, timestamp DESC);",
"CREATE INDEX IF NOT EXISTS idx_trades_symbol_side_ts ON trade_events (symbol, side, timestamp DESC);"
]
return await self.apply_migration(
"2.0.4",
"Create performance indexes",
index_commands
)
async def create_continuous_aggregates(self) -> bool:
"""Create continuous aggregates for multi-timeframe data."""
aggregate_commands = [
# 1m OHLCV from 1s data
"""
CREATE MATERIALIZED VIEW IF NOT EXISTS ohlcv_1m_continuous
WITH (timescaledb.continuous) AS
SELECT
time_bucket('1 minute', timestamp) AS timestamp,
symbol,
'1m' AS timeframe,
first(open_price, timestamp) AS open_price,
max(high_price) AS high_price,
min(low_price) AS low_price,
last(close_price, timestamp) AS close_price,
sum(volume) AS volume,
sum(trade_count) AS trade_count
FROM ohlcv_data
WHERE timeframe = '1s'
GROUP BY time_bucket('1 minute', timestamp), symbol
WITH NO DATA;
""",
# 5m OHLCV from 1m data
"""
CREATE MATERIALIZED VIEW IF NOT EXISTS ohlcv_5m_continuous
WITH (timescaledb.continuous) AS
SELECT
time_bucket('5 minutes', timestamp) AS timestamp,
symbol,
'5m' AS timeframe,
first(open_price, timestamp) AS open_price,
max(high_price) AS high_price,
min(low_price) AS low_price,
last(close_price, timestamp) AS close_price,
sum(volume) AS volume,
sum(trade_count) AS trade_count
FROM ohlcv_data
WHERE timeframe = '1m'
GROUP BY time_bucket('5 minutes', timestamp), symbol
WITH NO DATA;
""",
# 15m OHLCV from 5m data
"""
CREATE MATERIALIZED VIEW IF NOT EXISTS ohlcv_15m_continuous
WITH (timescaledb.continuous) AS
SELECT
time_bucket('15 minutes', timestamp) AS timestamp,
symbol,
'15m' AS timeframe,
first(open_price, timestamp) AS open_price,
max(high_price) AS high_price,
min(low_price) AS low_price,
last(close_price, timestamp) AS close_price,
sum(volume) AS volume,
sum(trade_count) AS trade_count
FROM ohlcv_data
WHERE timeframe = '5m'
GROUP BY time_bucket('15 minutes', timestamp), symbol
WITH NO DATA;
""",
# 1h OHLCV from 15m data
"""
CREATE MATERIALIZED VIEW IF NOT EXISTS ohlcv_1h_continuous
WITH (timescaledb.continuous) AS
SELECT
time_bucket('1 hour', timestamp) AS timestamp,
symbol,
'1h' AS timeframe,
first(open_price, timestamp) AS open_price,
max(high_price) AS high_price,
min(low_price) AS low_price,
last(close_price, timestamp) AS close_price,
sum(volume) AS volume,
sum(trade_count) AS trade_count
FROM ohlcv_data
WHERE timeframe = '15m'
GROUP BY time_bucket('1 hour', timestamp), symbol
WITH NO DATA;
""",
# 1d OHLCV from 1h data
"""
CREATE MATERIALIZED VIEW IF NOT EXISTS ohlcv_1d_continuous
WITH (timescaledb.continuous) AS
SELECT
time_bucket('1 day', timestamp) AS timestamp,
symbol,
'1d' AS timeframe,
first(open_price, timestamp) AS open_price,
max(high_price) AS high_price,
min(low_price) AS low_price,
last(close_price, timestamp) AS close_price,
sum(volume) AS volume,
sum(trade_count) AS trade_count
FROM ohlcv_data
WHERE timeframe = '1h'
GROUP BY time_bucket('1 day', timestamp), symbol
WITH NO DATA;
"""
]
return await self.apply_migration(
"2.0.5",
"Create continuous aggregates for multi-timeframe OHLCV",
aggregate_commands
)
async def setup_compression_policies(self) -> bool:
"""Set up compression policies for efficient storage."""
compression_commands = [
# Compress OHLCV data older than 7 days
"SELECT add_compression_policy('ohlcv_data', INTERVAL '7 days', if_not_exists => TRUE);",
# Compress order book snapshots older than 1 day
"SELECT add_compression_policy('order_book_snapshots', INTERVAL '1 day', if_not_exists => TRUE);",
# Compress order book 1s aggregations older than 2 days
"SELECT add_compression_policy('order_book_1s_agg', INTERVAL '2 days', if_not_exists => TRUE);",
# Compress order book imbalances older than 2 days
"SELECT add_compression_policy('order_book_imbalances', INTERVAL '2 days', if_not_exists => TRUE);",
# Compress trade events older than 7 days
"SELECT add_compression_policy('trade_events', INTERVAL '7 days', if_not_exists => TRUE);"
]
return await self.apply_migration(
"2.0.6",
"Setup compression policies",
compression_commands
)
async def setup_retention_policies(self) -> bool:
"""Set up data retention policies."""
retention_commands = [
# Retain OHLCV data for 2 years
"SELECT add_retention_policy('ohlcv_data', INTERVAL '2 years', if_not_exists => TRUE);",
# Retain order book snapshots for 30 days
"SELECT add_retention_policy('order_book_snapshots', INTERVAL '30 days', if_not_exists => TRUE);",
# Retain order book 1s aggregations for 60 days
"SELECT add_retention_policy('order_book_1s_agg', INTERVAL '60 days', if_not_exists => TRUE);",
# Retain order book imbalances for 60 days
"SELECT add_retention_policy('order_book_imbalances', INTERVAL '60 days', if_not_exists => TRUE);",
# Retain trade events for 90 days
"SELECT add_retention_policy('trade_events', INTERVAL '90 days', if_not_exists => TRUE);"
]
return await self.apply_migration(
"2.0.7",
"Setup retention policies",
retention_commands
)
async def setup_complete_schema(self) -> bool:
"""Set up the complete unified storage schema."""
try:
logger.info("Setting up unified storage schema...")
# Initialize schema tracking
await self.initialize_schema_tracking()
# Apply all migrations in order
migrations = [
("Enhanced OHLCV table", self.create_enhanced_ohlcv_table),
("Order book tables", self.create_order_book_tables),
("Trade events table", self.create_trade_events_table),
("Hypertables", self.create_hypertables),
("Indexes", self.create_indexes),
("Continuous aggregates", self.create_continuous_aggregates),
("Compression policies", self.setup_compression_policies),
("Retention policies", self.setup_retention_policies),
]
for name, migration_func in migrations:
logger.info(f"Applying migration: {name}")
success = await migration_func()
if not success:
logger.error(f"Failed to apply migration: {name}")
return False
logger.info(f"Successfully applied migration: {name}")
logger.info("Complete unified storage schema setup successful")
return True
except Exception as e:
logger.error(f"Failed to setup complete schema: {e}")
return False
async def get_schema_info(self) -> Dict:
"""Get information about the current schema state."""
try:
async with self.pool.acquire() as conn:
# Get applied migrations
migrations = await conn.fetch("""
SELECT version, applied_at, description
FROM schema_migrations
ORDER BY applied_at
""")
# Get table information
tables = await conn.fetch("""
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) as size
FROM pg_tables
WHERE schemaname = 'public'
AND tablename IN (
'ohlcv_data',
'order_book_snapshots',
'order_book_1s_agg',
'order_book_imbalances',
'trade_events'
)
ORDER BY tablename
""")
# Get hypertable information
hypertables = await conn.fetch("""
SELECT
hypertable_name,
num_chunks,
compression_enabled,
pg_size_pretty(total_bytes) as total_size,
pg_size_pretty(compressed_total_bytes) as compressed_size
FROM timescaledb_information.hypertables
WHERE hypertable_schema = 'public'
ORDER BY hypertable_name
""")
# Get continuous aggregates
continuous_aggs = await conn.fetch("""
SELECT
view_name,
materialization_hypertable_name,
pg_size_pretty(total_bytes) as size
FROM timescaledb_information.continuous_aggregates
WHERE view_schema = 'public'
ORDER BY view_name
""")
return {
"migrations": [dict(m) for m in migrations],
"tables": [dict(t) for t in tables],
"hypertables": [dict(h) for h in hypertables],
"continuous_aggregates": [dict(c) for c in continuous_aggs]
}
except Exception as e:
logger.error(f"Failed to get schema info: {e}")
return {}
async def verify_schema(self) -> bool:
"""Verify that all required tables and indexes exist."""
try:
async with self.pool.acquire() as conn:
# Check required tables
required_tables = [
'ohlcv_data',
'order_book_snapshots',
'order_book_1s_agg',
'order_book_imbalances',
'trade_events'
]
for table in required_tables:
exists = await conn.fetchval("""
SELECT EXISTS (
SELECT FROM pg_tables
WHERE schemaname = 'public'
AND tablename = $1
)
""", table)
if not exists:
logger.error(f"Required table missing: {table}")
return False
# Check hypertables
for table in required_tables:
is_hypertable = await conn.fetchval("""
SELECT EXISTS (
SELECT FROM timescaledb_information.hypertables
WHERE hypertable_schema = 'public'
AND hypertable_name = $1
)
""", table)
if not is_hypertable:
logger.error(f"Table is not a hypertable: {table}")
return False
logger.info("Schema verification successful")
return True
except Exception as e:
logger.error(f"Schema verification failed: {e}")
return False

View File

@@ -0,0 +1,337 @@
# Unified Data Storage Setup Guide
## Overview
The unified data storage system consolidates all market data storage into a single TimescaleDB backend, replacing fragmented Parquet files, pickle files, and in-memory caches.
## Prerequisites
### 1. PostgreSQL with TimescaleDB
You need PostgreSQL 12+ with the TimescaleDB extension installed.
#### Installation Options
**Option A: Docker (Recommended)**
```bash
docker run -d --name timescaledb \
-p 5432:5432 \
-e POSTGRES_PASSWORD=postgres \
-e POSTGRES_DB=trading_data \
timescale/timescaledb:latest-pg14
```
**Option B: Local Installation**
- Follow TimescaleDB installation guide: https://docs.timescale.com/install/latest/
- Create database: `createdb trading_data`
### 2. Python Dependencies
Ensure you have the required Python packages:
```bash
pip install asyncpg
```
## Database Configuration
Update your `config.yaml` with database connection details:
```yaml
database:
host: localhost
port: 5432
name: trading_data
user: postgres
password: postgres
pool_size: 20
```
## Setup Process
### Step 1: Run Setup Script
```bash
python scripts/setup_unified_storage.py
```
This script will:
1. Connect to the database
2. Verify TimescaleDB extension
3. Create all required tables
4. Convert tables to hypertables
5. Create indexes for performance
6. Set up continuous aggregates
7. Configure compression policies
8. Configure retention policies
9. Verify the setup
10. Run basic operation tests
### Step 2: Verify Setup
The setup script will display schema information:
```
=== Schema Information ===
Migrations applied: 8
Tables created: 5
Hypertables: 5
Continuous aggregates: 5
=== Table Sizes ===
ohlcv_data: 8192 bytes
order_book_snapshots: 8192 bytes
order_book_1s_agg: 8192 bytes
order_book_imbalances: 8192 bytes
trade_events: 8192 bytes
=== Hypertables ===
ohlcv_data: 0 chunks, compression=enabled
order_book_snapshots: 0 chunks, compression=enabled
order_book_1s_agg: 0 chunks, compression=enabled
order_book_imbalances: 0 chunks, compression=enabled
trade_events: 0 chunks, compression=enabled
=== Continuous Aggregates ===
ohlcv_1m_continuous: 8192 bytes
ohlcv_5m_continuous: 8192 bytes
ohlcv_15m_continuous: 8192 bytes
ohlcv_1h_continuous: 8192 bytes
ohlcv_1d_continuous: 8192 bytes
```
## Database Schema
### Tables
#### 1. ohlcv_data
Stores candlestick data for all timeframes with pre-calculated technical indicators.
**Columns:**
- `timestamp` (TIMESTAMPTZ): Candle timestamp
- `symbol` (VARCHAR): Trading pair (e.g., 'ETH/USDT')
- `timeframe` (VARCHAR): Timeframe (1s, 1m, 5m, 15m, 1h, 1d)
- `open_price`, `high_price`, `low_price`, `close_price` (DECIMAL): OHLC prices
- `volume` (DECIMAL): Trading volume
- `trade_count` (INTEGER): Number of trades
- Technical indicators: `rsi_14`, `macd`, `macd_signal`, `bb_upper`, `bb_middle`, `bb_lower`, etc.
**Primary Key:** `(timestamp, symbol, timeframe)`
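Because the primary key is `(timestamp, symbol, timeframe)`, writers can upsert candles idempotently and backfill indicator columns with a later UPDATE. A minimal asyncpg sketch (the `upsert_candle` helper and the `candle` dict layout are illustrative, not existing code in this repo):
```python
from decimal import Decimal
import asyncpg

async def upsert_candle(pool: asyncpg.Pool, symbol: str, timeframe: str, candle: dict) -> None:
    """Insert or update one OHLCV row; indicator columns stay NULL until backfilled."""
    async with pool.acquire() as conn:
        await conn.execute(
            """
            INSERT INTO ohlcv_data
                (timestamp, symbol, timeframe, open_price, high_price,
                 low_price, close_price, volume, trade_count)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
            ON CONFLICT (timestamp, symbol, timeframe) DO UPDATE
                SET high_price  = GREATEST(ohlcv_data.high_price, EXCLUDED.high_price),
                    low_price   = LEAST(ohlcv_data.low_price, EXCLUDED.low_price),
                    close_price = EXCLUDED.close_price,
                    volume      = EXCLUDED.volume,
                    trade_count = EXCLUDED.trade_count
            """,
            candle["timestamp"], symbol, timeframe,
            # DECIMAL columns are exchanged as decimal.Decimal by asyncpg
            Decimal(str(candle["open"])), Decimal(str(candle["high"])),
            Decimal(str(candle["low"])), Decimal(str(candle["close"])),
            Decimal(str(candle["volume"])), candle.get("trade_count", 0),
        )
```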
#### 2. order_book_snapshots
Stores raw order book snapshots.
**Columns:**
- `timestamp` (TIMESTAMPTZ): Snapshot timestamp
- `symbol` (VARCHAR): Trading pair
- `exchange` (VARCHAR): Exchange name
- `bids` (JSONB): Bid levels (top 50)
- `asks` (JSONB): Ask levels (top 50)
- `mid_price`, `spread`, `bid_volume`, `ask_volume` (DECIMAL): Calculated metrics
**Primary Key:** `(timestamp, symbol, exchange)`
#### 3. order_book_1s_agg
Stores 1-second aggregated order book data with $1 price buckets.
**Columns:**
- `timestamp` (TIMESTAMPTZ): Aggregation timestamp
- `symbol` (VARCHAR): Trading pair
- `price_bucket` (DECIMAL): Price bucket ($1 increments)
- `bid_volume`, `ask_volume` (DECIMAL): Aggregated volumes
- `bid_count`, `ask_count` (INTEGER): Number of orders
- `imbalance` (DECIMAL): Order book imbalance
**Primary Key:** `(timestamp, symbol, price_bucket)`
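The bucket and imbalance values can be derived directly from raw bid/ask levels before insertion. A minimal sketch of that aggregation (the `bucket_order_book` helper, its `(price, size)` input format, and the `(bid - ask) / (bid + ask)` imbalance formula are assumptions, not the production aggregator):
```python
import math
from collections import defaultdict

def bucket_order_book(bids, asks, bucket_size: float = 1.0) -> dict:
    """Fold (price, size) levels into fixed-width price buckets with per-bucket imbalance."""
    buckets = defaultdict(lambda: {"bid_volume": 0.0, "ask_volume": 0.0,
                                   "bid_count": 0, "ask_count": 0})
    for price, size in bids:
        b = math.floor(price / bucket_size) * bucket_size
        buckets[b]["bid_volume"] += size
        buckets[b]["bid_count"] += 1
    for price, size in asks:
        b = math.floor(price / bucket_size) * bucket_size
        buckets[b]["ask_volume"] += size
        buckets[b]["ask_count"] += 1
    for agg in buckets.values():
        total = agg["bid_volume"] + agg["ask_volume"]
        agg["imbalance"] = (agg["bid_volume"] - agg["ask_volume"]) / total if total else 0.0
    return dict(buckets)
```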
#### 4. order_book_imbalances
Stores multi-timeframe order book imbalance metrics.
**Columns:**
- `timestamp` (TIMESTAMPTZ): Calculation timestamp
- `symbol` (VARCHAR): Trading pair
- `imbalance_1s`, `imbalance_5s`, `imbalance_15s`, `imbalance_60s` (DECIMAL): Imbalances
- `volume_imbalance_1s`, `volume_imbalance_5s`, etc. (DECIMAL): Volume-weighted imbalances
- `price_range` (DECIMAL): Price range used for calculation
**Primary Key:** `(timestamp, symbol)`
#### 5. trade_events
Stores individual trade events.
**Columns:**
- `timestamp` (TIMESTAMPTZ): Trade timestamp
- `symbol` (VARCHAR): Trading pair
- `exchange` (VARCHAR): Exchange name
- `price` (DECIMAL): Trade price
- `size` (DECIMAL): Trade size
- `side` (VARCHAR): Trade side ('buy' or 'sell')
- `trade_id` (VARCHAR): Unique trade identifier
**Primary Key:** `(timestamp, symbol, exchange, trade_id)`
### Continuous Aggregates
Continuous aggregates automatically generate higher timeframe data from lower timeframes:
1. **ohlcv_1m_continuous**: 1-minute candles from 1-second data
2. **ohlcv_5m_continuous**: 5-minute candles from 1-minute data
3. **ohlcv_15m_continuous**: 15-minute candles from 5-minute data
4. **ohlcv_1h_continuous**: 1-hour candles from 15-minute data
5. **ohlcv_1d_continuous**: 1-day candles from 1-hour data
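These views are created `WITH NO DATA`, so they stay empty until a refresh runs. One way to schedule automatic refreshes, assuming TimescaleDB 2.x (the offsets below are illustrative defaults, not values taken from the setup script):
```python
import asyncpg

REFRESH_POLICIES = [
    # (view, start_offset, end_offset, schedule_interval)
    ("ohlcv_1m_continuous",  "1 hour",  "1 minute",   "1 minute"),
    ("ohlcv_5m_continuous",  "4 hours", "5 minutes",  "5 minutes"),
    ("ohlcv_15m_continuous", "1 day",   "15 minutes", "15 minutes"),
    ("ohlcv_1h_continuous",  "3 days",  "1 hour",     "1 hour"),
    ("ohlcv_1d_continuous",  "7 days",  "1 day",      "1 day"),
]

async def add_refresh_policies(pool: asyncpg.Pool) -> None:
    """Register automatic refresh jobs for each continuous aggregate."""
    async with pool.acquire() as conn:
        for view, start, end, every in REFRESH_POLICIES:
            await conn.execute(
                f"""
                SELECT add_continuous_aggregate_policy(
                    '{view}',
                    start_offset      => INTERVAL '{start}',
                    end_offset        => INTERVAL '{end}',
                    schedule_interval => INTERVAL '{every}',
                    if_not_exists     => TRUE
                );
                """
            )
```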
### Compression Policies
Data is automatically compressed to save storage:
- **ohlcv_data**: Compress after 7 days
- **order_book_snapshots**: Compress after 1 day
- **order_book_1s_agg**: Compress after 2 days
- **order_book_imbalances**: Compress after 2 days
- **trade_events**: Compress after 7 days
Expected compression ratio: **>80%**
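Note that `add_compression_policy` only schedules the background job; columnar compression must also be enabled on each hypertable, otherwise the policy has nothing to compress. A sketch of enabling it, with segment-by columns chosen here as an assumption to mirror each table's primary key:
```python
import asyncpg

COMPRESSION_SEGMENT_BY = {
    "ohlcv_data":            "symbol, timeframe",
    "order_book_snapshots":  "symbol, exchange",
    "order_book_1s_agg":     "symbol, price_bucket",
    "order_book_imbalances": "symbol",
    "trade_events":          "symbol, exchange, trade_id",
}

async def enable_compression(pool: asyncpg.Pool) -> None:
    """Turn on columnar compression so the scheduled compression policies can run."""
    async with pool.acquire() as conn:
        for table, segment_by in COMPRESSION_SEGMENT_BY.items():
            await conn.execute(
                f"""
                ALTER TABLE {table} SET (
                    timescaledb.compress,
                    timescaledb.compress_segmentby = '{segment_by}'
                );
                """
            )
```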
### Retention Policies
Old data is automatically deleted:
- **ohlcv_data**: Retain for 2 years
- **order_book_snapshots**: Retain for 30 days
- **order_book_1s_agg**: Retain for 60 days
- **order_book_imbalances**: Retain for 60 days
- **trade_events**: Retain for 90 days
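Compression and retention both run as TimescaleDB background jobs. To confirm they were registered, you can query the jobs view (a sketch assuming TimescaleDB 2.x):
```python
import asyncpg

async def list_policy_jobs(pool: asyncpg.Pool) -> list:
    """Return the background jobs TimescaleDB scheduled for compression and retention."""
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT job_id, proc_name, hypertable_name, schedule_interval, config
            FROM timescaledb_information.jobs
            WHERE proc_name IN ('policy_compression', 'policy_retention')
            ORDER BY hypertable_name, proc_name;
            """
        )
        return [dict(r) for r in rows]
```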
## Performance Optimization
### Indexes
All tables have optimized indexes for common query patterns:
- Symbol + timestamp queries
- Timeframe-specific queries
- Exchange-specific queries
- Multi-column composite indexes
### Query Performance Targets
- **Cache reads**: <10ms
- **Single timestamp queries**: <100ms
- **Time range queries (1 hour)**: <500ms
- **Ingestion throughput**: >1000 ops/sec
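A rough way to sanity-check the ingestion target is a small `executemany` benchmark against `ohlcv_data` (the row count and synthetic values below are illustrative only):
```python
import time
from datetime import datetime, timedelta, timezone
from decimal import Decimal

import asyncpg

async def benchmark_inserts(pool: asyncpg.Pool, rows: int = 5000) -> float:
    """Insert synthetic 1s candles and return the achieved ops/sec."""
    start_ts = datetime.now(timezone.utc)
    batch = [
        (start_ts + timedelta(seconds=i), "ETH/USDT", "1s",
         Decimal("2000.0"), Decimal("2001.0"), Decimal("1999.0"),
         Decimal("2000.5"), Decimal("10.0"))
        for i in range(rows)
    ]
    t0 = time.perf_counter()
    async with pool.acquire() as conn:
        await conn.executemany(
            """
            INSERT INTO ohlcv_data
                (timestamp, symbol, timeframe, open_price, high_price,
                 low_price, close_price, volume)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
            ON CONFLICT (timestamp, symbol, timeframe) DO NOTHING
            """,
            batch,
        )
    return rows / (time.perf_counter() - t0)
```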
### Best Practices
1. **Use time_bucket for aggregations**:
```sql
SELECT time_bucket('1 minute', timestamp) AS bucket,
symbol,
avg(close_price) AS avg_price
FROM ohlcv_data
WHERE symbol = 'ETH/USDT'
AND timestamp >= NOW() - INTERVAL '1 hour'
GROUP BY bucket, symbol;
```
2. **Query specific timeframes**:
```sql
SELECT * FROM ohlcv_data
WHERE symbol = 'ETH/USDT'
AND timeframe = '1m'
AND timestamp >= NOW() - INTERVAL '1 day'
ORDER BY timestamp DESC;
```
3. **Use continuous aggregates for historical data**:
```sql
SELECT * FROM ohlcv_1h_continuous
WHERE symbol = 'ETH/USDT'
AND timestamp >= NOW() - INTERVAL '7 days'
ORDER BY timestamp DESC;
```
## Monitoring
### Check Database Size
```sql
SELECT
hypertable_name,
pg_size_pretty(total_bytes) AS total_size,
pg_size_pretty(compressed_total_bytes) AS compressed_size,
ROUND((1 - compressed_total_bytes::numeric / total_bytes::numeric) * 100, 2) AS compression_ratio
FROM timescaledb_information.hypertables
WHERE hypertable_schema = 'public';
```
### Check Chunk Information
```sql
SELECT
hypertable_name,
num_chunks,
num_compressed_chunks,
compression_enabled
FROM timescaledb_information.hypertables
WHERE hypertable_schema = 'public';
```
### Check Continuous Aggregate Status
```sql
SELECT
view_name,
materialization_hypertable_name,
pg_size_pretty(total_bytes) AS size
FROM timescaledb_information.continuous_aggregates
WHERE view_schema = 'public';
```
## Troubleshooting
### TimescaleDB Extension Not Found
If you see "TimescaleDB extension not found":
1. Ensure TimescaleDB is installed
2. Connect to database and run: `CREATE EXTENSION timescaledb;`
3. Restart the setup script
### Connection Refused
If you see "connection refused":
1. Check PostgreSQL is running: `pg_isready`
2. Verify connection details in `config.yaml`
3. Check firewall settings
### Permission Denied
If you see "permission denied":
1. Ensure database user has CREATE privileges
2. Grant privileges: `GRANT ALL PRIVILEGES ON DATABASE trading_data TO postgres;`
### Slow Queries
If queries are slow:
1. Check if indexes exist: `\di` in psql
2. Analyze query plan: `EXPLAIN ANALYZE <your query>`
3. Ensure compression is enabled
4. Consider adding more specific indexes
## Next Steps
After setup is complete:
1. **Implement data models** (Task 2)
2. **Implement cache layer** (Task 3)
3. **Implement database connection layer** (Task 4)
4. **Start data migration** from Parquet files (Task 7)
## Support
For issues or questions:
- Check TimescaleDB docs: https://docs.timescale.com/
- Review PostgreSQL logs: `tail -f /var/log/postgresql/postgresql-*.log`
- Enable debug logging in setup script

View File

@@ -0,0 +1,246 @@
#!/usr/bin/env python3
"""
Setup script for unified data storage system.
Initializes TimescaleDB schema and verifies setup.
"""
import asyncio
import asyncpg
import logging
import sys
from pathlib import Path
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from core.unified_storage_schema import UnifiedStorageSchemaManager
from core.config import get_config
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
async def create_connection_pool(config):
"""Create database connection pool."""
try:
# Build connection string
db_config = config.get('database', {})
# Default values
host = db_config.get('host', 'localhost')
port = db_config.get('port', 5432)
database = db_config.get('name', 'trading_data')
user = db_config.get('user', 'postgres')
password = db_config.get('password', 'postgres')
logger.info(f"Connecting to database: {host}:{port}/{database}")
pool = await asyncpg.create_pool(
host=host,
port=port,
database=database,
user=user,
password=password,
min_size=2,
max_size=10,
command_timeout=60
)
logger.info("Database connection pool created")
return pool
except Exception as e:
logger.error(f"Failed to create connection pool: {e}")
raise
async def verify_timescaledb(pool):
"""Verify TimescaleDB extension is available."""
try:
async with pool.acquire() as conn:
# Check if TimescaleDB extension exists
result = await conn.fetchval("""
SELECT EXISTS (
SELECT FROM pg_extension WHERE extname = 'timescaledb'
)
""")
if result:
# Get TimescaleDB version
version = await conn.fetchval("SELECT extversion FROM pg_extension WHERE extname = 'timescaledb'")
logger.info(f"TimescaleDB extension found (version {version})")
return True
else:
logger.warning("TimescaleDB extension not found, attempting to create...")
# Try to create extension
await conn.execute("CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE")
logger.info("TimescaleDB extension created successfully")
return True
except Exception as e:
logger.error(f"Failed to verify TimescaleDB: {e}")
logger.error("Please ensure TimescaleDB is installed: https://docs.timescale.com/install/latest/")
return False
async def setup_schema(pool):
"""Set up the complete unified storage schema."""
try:
schema_manager = UnifiedStorageSchemaManager(pool)
logger.info("Starting schema setup...")
success = await schema_manager.setup_complete_schema()
if success:
logger.info("Schema setup completed successfully")
# Verify schema
logger.info("Verifying schema...")
verified = await schema_manager.verify_schema()
if verified:
logger.info("Schema verification passed")
# Get schema info
info = await schema_manager.get_schema_info()
logger.info("\n=== Schema Information ===")
logger.info(f"Migrations applied: {len(info.get('migrations', []))}")
logger.info(f"Tables created: {len(info.get('tables', []))}")
logger.info(f"Hypertables: {len(info.get('hypertables', []))}")
logger.info(f"Continuous aggregates: {len(info.get('continuous_aggregates', []))}")
# Display table sizes
logger.info("\n=== Table Sizes ===")
for table in info.get('tables', []):
logger.info(f" {table['tablename']}: {table['size']}")
# Display hypertables
logger.info("\n=== Hypertables ===")
for ht in info.get('hypertables', []):
logger.info(f" {ht['hypertable_name']}: {ht['num_chunks']} chunks, "
f"compression={'enabled' if ht['compression_enabled'] else 'disabled'}")
# Display continuous aggregates
if info.get('continuous_aggregates'):
logger.info("\n=== Continuous Aggregates ===")
for ca in info.get('continuous_aggregates', []):
logger.info(f" {ca['view_name']}: {ca.get('size', 'N/A')}")
return True
else:
logger.error("Schema verification failed")
return False
else:
logger.error("Schema setup failed")
return False
except Exception as e:
logger.error(f"Error during schema setup: {e}")
return False
async def test_basic_operations(pool):
"""Test basic database operations."""
try:
logger.info("\n=== Testing Basic Operations ===")
async with pool.acquire() as conn:
# Test insert into ohlcv_data
logger.info("Testing OHLCV insert...")
await conn.execute("""
INSERT INTO ohlcv_data
(timestamp, symbol, timeframe, open_price, high_price, low_price, close_price, volume)
VALUES (NOW(), 'ETH/USDT', '1s', 2000.0, 2001.0, 1999.0, 2000.5, 100.0)
ON CONFLICT (timestamp, symbol, timeframe) DO NOTHING
""")
logger.info("✓ OHLCV insert successful")
# Test query
logger.info("Testing OHLCV query...")
result = await conn.fetchrow("""
SELECT * FROM ohlcv_data
WHERE symbol = 'ETH/USDT'
ORDER BY timestamp DESC
LIMIT 1
""")
if result:
logger.info(f"✓ OHLCV query successful: {dict(result)}")
# Test order book insert
logger.info("Testing order book insert...")
await conn.execute("""
INSERT INTO order_book_snapshots
(timestamp, symbol, exchange, bids, asks, mid_price, spread)
VALUES (NOW(), 'ETH/USDT', 'binance', '[]'::jsonb, '[]'::jsonb, 2000.0, 0.1)
ON CONFLICT (timestamp, symbol, exchange) DO NOTHING
""")
logger.info("✓ Order book insert successful")
# Test imbalances insert
logger.info("Testing imbalances insert...")
await conn.execute("""
INSERT INTO order_book_imbalances
(timestamp, symbol, imbalance_1s, imbalance_5s, imbalance_15s, imbalance_60s)
VALUES (NOW(), 'ETH/USDT', 0.5, 0.4, 0.3, 0.2)
ON CONFLICT (timestamp, symbol) DO NOTHING
""")
logger.info("✓ Imbalances insert successful")
logger.info("\n✓ All basic operations successful")
return True
except Exception as e:
logger.error(f"Basic operations test failed: {e}")
return False
async def main():
"""Main setup function."""
logger.info("=== Unified Data Storage Setup ===\n")
pool = None
try:
# Load configuration
config = get_config()
# Create connection pool
pool = await create_connection_pool(config)
# Verify TimescaleDB
if not await verify_timescaledb(pool):
logger.error("TimescaleDB verification failed")
return 1
# Setup schema
if not await setup_schema(pool):
logger.error("Schema setup failed")
return 1
# Test basic operations
if not await test_basic_operations(pool):
logger.error("Basic operations test failed")
return 1
logger.info("\n=== Setup Complete ===")
logger.info("Unified data storage system is ready to use!")
return 0
except Exception as e:
logger.error(f"Setup failed: {e}")
return 1
finally:
if pool:
await pool.close()
logger.info("Database connection pool closed")
if __name__ == "__main__":
exit_code = asyncio.run(main())
sys.exit(exit_code)