""" SQLite Storage for Long-Term OHLCV Data and Annotation Replay This module provides persistent storage for: 1. OHLCV data for all timeframes (unlimited history) 2. Complete annotation data with market context 3. Model predictions and features at annotation time 4. Efficient querying for historical replay Parquet files are used for recent data (1500 candles) for speed. SQLite is used for long-term storage and annotation replay. """ import sqlite3 import logging import pandas as pd from datetime import datetime from pathlib import Path from typing import Dict, List, Optional, Tuple, Any import json import pickle logger = logging.getLogger(__name__) class SQLiteStorage: """SQLite storage for OHLCV data and annotations""" def __init__(self, db_path: str = "cache/trading_data.db"): """Initialize SQLite storage""" self.db_path = Path(db_path) self.db_path.parent.mkdir(parents=True, exist_ok=True) # Initialize database schema self._init_schema() logger.info(f"SQLite storage initialized: {self.db_path}") def _init_schema(self): """Initialize database schema""" conn = sqlite3.connect(self.db_path) cursor = conn.cursor() # OHLCV data table - stores all historical candles cursor.execute(""" CREATE TABLE IF NOT EXISTS ohlcv_data ( id INTEGER PRIMARY KEY AUTOINCREMENT, symbol TEXT NOT NULL, timeframe TEXT NOT NULL, timestamp INTEGER NOT NULL, open REAL NOT NULL, high REAL NOT NULL, low REAL NOT NULL, close REAL NOT NULL, volume REAL NOT NULL, created_at INTEGER NOT NULL, UNIQUE(symbol, timeframe, timestamp) ) """) # Indexes for fast queries cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_timeframe ON ohlcv_data(symbol, timeframe) """) cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_ohlcv_timestamp ON ohlcv_data(timestamp) """) cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_ohlcv_lookup ON ohlcv_data(symbol, timeframe, timestamp) """) # Annotations table - stores complete annotation data cursor.execute(""" CREATE TABLE IF NOT EXISTS annotations ( annotation_id TEXT PRIMARY KEY, symbol TEXT NOT NULL, timeframe TEXT NOT NULL, direction TEXT NOT NULL, entry_timestamp INTEGER NOT NULL, entry_price REAL NOT NULL, exit_timestamp INTEGER NOT NULL, exit_price REAL NOT NULL, profit_loss_pct REAL NOT NULL, notes TEXT, created_at INTEGER NOT NULL, market_context TEXT, model_features TEXT, pivot_data TEXT ) """) # Annotation OHLCV snapshots - stores market data at annotation time cursor.execute(""" CREATE TABLE IF NOT EXISTS annotation_ohlcv_snapshots ( id INTEGER PRIMARY KEY AUTOINCREMENT, annotation_id TEXT NOT NULL, timeframe TEXT NOT NULL, timestamp INTEGER NOT NULL, open REAL NOT NULL, high REAL NOT NULL, low REAL NOT NULL, close REAL NOT NULL, volume REAL NOT NULL, FOREIGN KEY (annotation_id) REFERENCES annotations(annotation_id), UNIQUE(annotation_id, timeframe, timestamp) ) """) cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_annotation_snapshots ON annotation_ohlcv_snapshots(annotation_id, timeframe) """) # Model predictions table - stores model outputs at annotation time cursor.execute(""" CREATE TABLE IF NOT EXISTS annotation_model_predictions ( id INTEGER PRIMARY KEY AUTOINCREMENT, annotation_id TEXT NOT NULL, model_name TEXT NOT NULL, timestamp INTEGER NOT NULL, prediction TEXT NOT NULL, confidence REAL, features TEXT, FOREIGN KEY (annotation_id) REFERENCES annotations(annotation_id) ) """) # Cache metadata table - tracks what data we have cursor.execute(""" CREATE TABLE IF NOT EXISTS cache_metadata ( symbol TEXT NOT NULL, timeframe TEXT NOT NULL, first_timestamp INTEGER NOT NULL, last_timestamp INTEGER NOT NULL, candle_count INTEGER NOT NULL, last_update INTEGER NOT NULL, PRIMARY KEY (symbol, timeframe) ) """) conn.commit() conn.close() logger.info("SQLite schema initialized") def store_ohlcv_data(self, symbol: str, timeframe: str, df: pd.DataFrame) -> int: """ Store OHLCV data in SQLite Args: symbol: Trading symbol timeframe: Timeframe df: DataFrame with OHLCV data (timestamp as index or column) Returns: Number of rows inserted """ if df is None or df.empty: return 0 try: conn = sqlite3.connect(self.db_path) # Prepare data df_copy = df.copy() # Ensure timestamp column exists if 'timestamp' not in df_copy.columns: df_copy['timestamp'] = df_copy.index # Convert timestamp to Unix milliseconds if pd.api.types.is_datetime64_any_dtype(df_copy['timestamp']): df_copy['timestamp'] = df_copy['timestamp'].astype('int64') // 10**6 # Add metadata df_copy['symbol'] = symbol df_copy['timeframe'] = timeframe df_copy['created_at'] = int(datetime.now().timestamp() * 1000) # Select columns in correct order columns = ['symbol', 'timeframe', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'created_at'] df_insert = df_copy[columns] # Insert data (ignore duplicates) df_insert.to_sql('ohlcv_data', conn, if_exists='append', index=False) # Update metadata cursor = conn.cursor() cursor.execute(""" INSERT OR REPLACE INTO cache_metadata (symbol, timeframe, first_timestamp, last_timestamp, candle_count, last_update) VALUES (?, ?, COALESCE((SELECT MIN(timestamp) FROM ohlcv_data WHERE symbol=? AND timeframe=?), ?), COALESCE((SELECT MAX(timestamp) FROM ohlcv_data WHERE symbol=? AND timeframe=?), ?), (SELECT COUNT(*) FROM ohlcv_data WHERE symbol=? AND timeframe=?), ?) """, ( symbol, timeframe, symbol, timeframe, df_copy['timestamp'].min(), symbol, timeframe, df_copy['timestamp'].max(), symbol, timeframe, int(datetime.now().timestamp() * 1000) )) conn.commit() rows_inserted = len(df_insert) conn.close() logger.info(f"Stored {rows_inserted} candles for {symbol} {timeframe} in SQLite") return rows_inserted except Exception as e: logger.error(f"Error storing OHLCV data in SQLite: {e}") return 0 def get_ohlcv_data(self, symbol: str, timeframe: str, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, limit: Optional[int] = None) -> Optional[pd.DataFrame]: """ Retrieve OHLCV data from SQLite Args: symbol: Trading symbol timeframe: Timeframe start_time: Start time filter end_time: End time filter limit: Maximum number of candles Returns: DataFrame with OHLCV data """ try: conn = sqlite3.connect(self.db_path) # Build query query = """ SELECT timestamp, open, high, low, close, volume FROM ohlcv_data WHERE symbol = ? AND timeframe = ? """ params = [symbol, timeframe] if start_time: query += " AND timestamp >= ?" params.append(int(start_time.timestamp() * 1000)) if end_time: query += " AND timestamp <= ?" params.append(int(end_time.timestamp() * 1000)) query += " ORDER BY timestamp DESC" if limit: query += f" LIMIT {limit}" # Execute query df = pd.read_sql_query(query, conn, params=params) conn.close() if df.empty: return None # Convert timestamp to datetime df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True) df = df.set_index('timestamp') df = df.sort_index() logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from SQLite") return df except Exception as e: logger.error(f"Error retrieving OHLCV data from SQLite: {e}") return None def store_annotation(self, annotation_id: str, annotation_data: Dict[str, Any], market_snapshots: Dict[str, pd.DataFrame], model_predictions: Optional[List[Dict]] = None) -> bool: """ Store complete annotation with market context and model data Args: annotation_id: Unique annotation ID annotation_data: Annotation metadata (entry, exit, symbol, etc.) market_snapshots: Dict of {timeframe: DataFrame} with OHLCV data model_predictions: List of model predictions at annotation time Returns: True if successful """ try: conn = sqlite3.connect(self.db_path) cursor = conn.cursor() # Parse timestamps entry_time = annotation_data.get('entry', {}).get('timestamp') exit_time = annotation_data.get('exit', {}).get('timestamp') if isinstance(entry_time, str): entry_time = datetime.fromisoformat(entry_time.replace('Z', '+00:00')) if isinstance(exit_time, str): exit_time = datetime.fromisoformat(exit_time.replace('Z', '+00:00')) # Store annotation metadata cursor.execute(""" INSERT OR REPLACE INTO annotations (annotation_id, symbol, timeframe, direction, entry_timestamp, entry_price, exit_timestamp, exit_price, profit_loss_pct, notes, created_at, market_context, model_features, pivot_data) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( annotation_id, annotation_data.get('symbol'), annotation_data.get('timeframe'), annotation_data.get('direction'), int(entry_time.timestamp() * 1000), annotation_data.get('entry', {}).get('price'), int(exit_time.timestamp() * 1000), annotation_data.get('exit', {}).get('price'), annotation_data.get('profit_loss_pct'), annotation_data.get('notes', ''), int(datetime.now().timestamp() * 1000), json.dumps(annotation_data.get('entry_market_state', {})), json.dumps(annotation_data.get('model_features', {})), json.dumps(annotation_data.get('pivot_data', {})) )) # Store OHLCV snapshots for each timeframe for timeframe, df in market_snapshots.items(): if df is None or df.empty: continue df_copy = df.copy() # Ensure timestamp column if 'timestamp' not in df_copy.columns: df_copy['timestamp'] = df_copy.index # Convert timestamp to Unix milliseconds if pd.api.types.is_datetime64_any_dtype(df_copy['timestamp']): df_copy['timestamp'] = df_copy['timestamp'].astype('int64') // 10**6 # Insert each candle for _, row in df_copy.iterrows(): cursor.execute(""" INSERT OR REPLACE INTO annotation_ohlcv_snapshots (annotation_id, timeframe, timestamp, open, high, low, close, volume) VALUES (?, ?, ?, ?, ?, ?, ?, ?) """, ( annotation_id, timeframe, int(row['timestamp']), float(row['open']), float(row['high']), float(row['low']), float(row['close']), float(row['volume']) )) # Store model predictions if model_predictions: for pred in model_predictions: cursor.execute(""" INSERT INTO annotation_model_predictions (annotation_id, model_name, timestamp, prediction, confidence, features) VALUES (?, ?, ?, ?, ?, ?) """, ( annotation_id, pred.get('model_name'), int(pred.get('timestamp', datetime.now().timestamp() * 1000)), json.dumps(pred.get('prediction')), pred.get('confidence'), json.dumps(pred.get('features', {})) )) conn.commit() conn.close() logger.info(f"Stored annotation {annotation_id} with {len(market_snapshots)} timeframes in SQLite") return True except Exception as e: logger.error(f"Error storing annotation in SQLite: {e}") import traceback traceback.print_exc() return False def get_annotation(self, annotation_id: str) -> Optional[Dict[str, Any]]: """ Retrieve complete annotation with all market data Args: annotation_id: Annotation ID Returns: Dict with annotation data, OHLCV snapshots, and model predictions """ try: conn = sqlite3.connect(self.db_path) cursor = conn.cursor() # Get annotation metadata cursor.execute(""" SELECT * FROM annotations WHERE annotation_id = ? """, (annotation_id,)) row = cursor.fetchone() if not row: conn.close() return None # Parse annotation data columns = [desc[0] for desc in cursor.description] annotation = dict(zip(columns, row)) # Parse JSON fields annotation['market_context'] = json.loads(annotation.get('market_context', '{}')) annotation['model_features'] = json.loads(annotation.get('model_features', '{}')) annotation['pivot_data'] = json.loads(annotation.get('pivot_data', '{}')) # Get OHLCV snapshots cursor.execute(""" SELECT timeframe, timestamp, open, high, low, close, volume FROM annotation_ohlcv_snapshots WHERE annotation_id = ? ORDER BY timeframe, timestamp """, (annotation_id,)) snapshots = {} for row in cursor.fetchall(): timeframe = row[0] if timeframe not in snapshots: snapshots[timeframe] = [] snapshots[timeframe].append({ 'timestamp': row[1], 'open': row[2], 'high': row[3], 'low': row[4], 'close': row[5], 'volume': row[6] }) # Convert to DataFrames annotation['ohlcv_snapshots'] = {} for timeframe, data in snapshots.items(): df = pd.DataFrame(data) df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True) df = df.set_index('timestamp') annotation['ohlcv_snapshots'][timeframe] = df # Get model predictions cursor.execute(""" SELECT model_name, timestamp, prediction, confidence, features FROM annotation_model_predictions WHERE annotation_id = ? """, (annotation_id,)) predictions = [] for row in cursor.fetchall(): predictions.append({ 'model_name': row[0], 'timestamp': row[1], 'prediction': json.loads(row[2]), 'confidence': row[3], 'features': json.loads(row[4]) }) annotation['model_predictions'] = predictions conn.close() logger.info(f"Retrieved annotation {annotation_id} with {len(snapshots)} timeframes from SQLite") return annotation except Exception as e: logger.error(f"Error retrieving annotation from SQLite: {e}") return None def get_cache_stats(self) -> Dict[str, Any]: """Get cache statistics""" try: conn = sqlite3.connect(self.db_path) cursor = conn.cursor() # Get OHLCV stats cursor.execute(""" SELECT symbol, timeframe, candle_count, first_timestamp, last_timestamp FROM cache_metadata ORDER BY symbol, timeframe """) ohlcv_stats = [] for row in cursor.fetchall(): ohlcv_stats.append({ 'symbol': row[0], 'timeframe': row[1], 'candle_count': row[2], 'first_timestamp': datetime.fromtimestamp(row[3] / 1000), 'last_timestamp': datetime.fromtimestamp(row[4] / 1000) }) # Get annotation count cursor.execute("SELECT COUNT(*) FROM annotations") annotation_count = cursor.fetchone()[0] # Get total OHLCV rows cursor.execute("SELECT COUNT(*) FROM ohlcv_data") total_candles = cursor.fetchone()[0] conn.close() return { 'ohlcv_stats': ohlcv_stats, 'annotation_count': annotation_count, 'total_candles': total_candles } except Exception as e: logger.error(f"Error getting cache stats: {e}") return {}