"""
SQLite Storage for Long-Term OHLCV Data and Annotation Replay

This module provides persistent storage for:

1. OHLCV data for all timeframes (unlimited history)
2. Complete annotation data with market context
3. Model predictions and features at annotation time
4. Efficient querying for historical replay

Parquet files are used for recent data (1500 candles) for speed.
SQLite is used for long-term storage and annotation replay.
"""
|
# Standard library
import json
import logging
import pickle
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Third-party
import pandas as pd

logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SQLiteStorage:
    """SQLite storage for long-term OHLCV data and annotation replay.

    Tables:
        ohlcv_data                   -- all historical candles
        annotations                  -- annotation metadata + JSON context
        annotation_ohlcv_snapshots   -- market data captured per annotation
        annotation_model_predictions -- model outputs at annotation time
        cache_metadata               -- per (symbol, timeframe) coverage info
    """

    def __init__(self, db_path: str = "cache/trading_data.db"):
        """Initialize SQLite storage and create the schema if needed.

        Args:
            db_path: Path to the SQLite database file. Parent directories
                are created automatically.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

        # Initialize database schema
        self._init_schema()

        logger.info(f"SQLite storage initialized: {self.db_path}")

    def _init_schema(self):
        """Create all tables and indexes if they do not already exist."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            # OHLCV data table - stores all historical candles.
            # UNIQUE(symbol, timeframe, timestamp) lets inserts use
            # INSERT OR IGNORE to silently skip duplicate candles.
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS ohlcv_data (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    symbol TEXT NOT NULL,
                    timeframe TEXT NOT NULL,
                    timestamp INTEGER NOT NULL,
                    open REAL NOT NULL,
                    high REAL NOT NULL,
                    low REAL NOT NULL,
                    close REAL NOT NULL,
                    volume REAL NOT NULL,
                    created_at INTEGER NOT NULL,
                    UNIQUE(symbol, timeframe, timestamp)
                )
            """)

            # Indexes for fast queries
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_timeframe
                ON ohlcv_data(symbol, timeframe)
            """)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_ohlcv_timestamp
                ON ohlcv_data(timestamp)
            """)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_ohlcv_lookup
                ON ohlcv_data(symbol, timeframe, timestamp)
            """)

            # Annotations table - stores complete annotation data.
            # market_context / model_features / pivot_data hold JSON text.
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS annotations (
                    annotation_id TEXT PRIMARY KEY,
                    symbol TEXT NOT NULL,
                    timeframe TEXT NOT NULL,
                    direction TEXT NOT NULL,
                    entry_timestamp INTEGER NOT NULL,
                    entry_price REAL NOT NULL,
                    exit_timestamp INTEGER NOT NULL,
                    exit_price REAL NOT NULL,
                    profit_loss_pct REAL NOT NULL,
                    notes TEXT,
                    created_at INTEGER NOT NULL,
                    market_context TEXT,
                    model_features TEXT,
                    pivot_data TEXT
                )
            """)

            # Annotation OHLCV snapshots - stores market data at annotation time
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS annotation_ohlcv_snapshots (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    annotation_id TEXT NOT NULL,
                    timeframe TEXT NOT NULL,
                    timestamp INTEGER NOT NULL,
                    open REAL NOT NULL,
                    high REAL NOT NULL,
                    low REAL NOT NULL,
                    close REAL NOT NULL,
                    volume REAL NOT NULL,
                    FOREIGN KEY (annotation_id) REFERENCES annotations(annotation_id),
                    UNIQUE(annotation_id, timeframe, timestamp)
                )
            """)

            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_annotation_snapshots
                ON annotation_ohlcv_snapshots(annotation_id, timeframe)
            """)

            # Model predictions table - stores model outputs at annotation time
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS annotation_model_predictions (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    annotation_id TEXT NOT NULL,
                    model_name TEXT NOT NULL,
                    timestamp INTEGER NOT NULL,
                    prediction TEXT NOT NULL,
                    confidence REAL,
                    features TEXT,
                    FOREIGN KEY (annotation_id) REFERENCES annotations(annotation_id)
                )
            """)

            # Cache metadata table - tracks what data we have
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS cache_metadata (
                    symbol TEXT NOT NULL,
                    timeframe TEXT NOT NULL,
                    first_timestamp INTEGER NOT NULL,
                    last_timestamp INTEGER NOT NULL,
                    candle_count INTEGER NOT NULL,
                    last_update INTEGER NOT NULL,
                    PRIMARY KEY (symbol, timeframe)
                )
            """)

            conn.commit()
        finally:
            # Close even if schema creation raises (no connection leak).
            conn.close()

        logger.info("SQLite schema initialized")

    @staticmethod
    def _to_epoch_ms(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with a 'timestamp' column in Unix milliseconds.

        Uses the index as the timestamp when no 'timestamp' column exists;
        datetime64 values are converted from nanoseconds to milliseconds.
        """
        df_copy = df.copy()
        if 'timestamp' not in df_copy.columns:
            df_copy['timestamp'] = df_copy.index
        if pd.api.types.is_datetime64_any_dtype(df_copy['timestamp']):
            # datetime64[ns] -> integer nanoseconds -> milliseconds
            df_copy['timestamp'] = df_copy['timestamp'].astype('int64') // 10**6
        return df_copy

    def store_ohlcv_data(self, symbol: str, timeframe: str, df: pd.DataFrame) -> int:
        """
        Store OHLCV data in SQLite, silently skipping duplicate candles.

        Args:
            symbol: Trading symbol
            timeframe: Timeframe
            df: DataFrame with OHLCV data (timestamp as index or column)

        Returns:
            Number of rows actually inserted (duplicates are ignored);
            0 on error or empty input.
        """
        if df is None or df.empty:
            return 0

        try:
            df_copy = self._to_epoch_ms(df)
            created_at = int(datetime.now().timestamp() * 1000)
            rows = [
                (symbol, timeframe, int(ts),
                 float(o), float(h), float(l), float(c), float(v), created_at)
                for ts, o, h, l, c, v in zip(
                    df_copy['timestamp'], df_copy['open'], df_copy['high'],
                    df_copy['low'], df_copy['close'], df_copy['volume'])
            ]

            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.cursor()

                # INSERT OR IGNORE: the UNIQUE(symbol, timeframe, timestamp)
                # constraint drops duplicate candles instead of raising
                # IntegrityError (pandas.to_sql(if_exists='append') would
                # abort the whole batch on the first duplicate).
                cursor.executemany("""
                    INSERT OR IGNORE INTO ohlcv_data
                    (symbol, timeframe, timestamp, open, high, low, close, volume, created_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, rows)
                # rowcount after executemany is the total of rows modified,
                # so ignored duplicates are not counted.
                rows_inserted = cursor.rowcount

                # Refresh coverage metadata for this (symbol, timeframe).
                # The table is guaranteed non-empty here, so MIN/MAX cannot
                # be NULL.
                cursor.execute("""
                    INSERT OR REPLACE INTO cache_metadata
                    (symbol, timeframe, first_timestamp, last_timestamp, candle_count, last_update)
                    VALUES (?, ?,
                        (SELECT MIN(timestamp) FROM ohlcv_data WHERE symbol=? AND timeframe=?),
                        (SELECT MAX(timestamp) FROM ohlcv_data WHERE symbol=? AND timeframe=?),
                        (SELECT COUNT(*) FROM ohlcv_data WHERE symbol=? AND timeframe=?),
                        ?)
                """, (
                    symbol, timeframe,
                    symbol, timeframe,
                    symbol, timeframe,
                    symbol, timeframe,
                    int(datetime.now().timestamp() * 1000)
                ))

                conn.commit()
            finally:
                conn.close()

            logger.info(f"Stored {rows_inserted} candles for {symbol} {timeframe} in SQLite")
            return rows_inserted

        except Exception as e:
            logger.error(f"Error storing OHLCV data in SQLite: {e}")
            return 0

    def get_ohlcv_data(self, symbol: str, timeframe: str,
                       start_time: Optional[datetime] = None,
                       end_time: Optional[datetime] = None,
                       limit: Optional[int] = None) -> Optional[pd.DataFrame]:
        """
        Retrieve OHLCV data from SQLite.

        Args:
            symbol: Trading symbol
            timeframe: Timeframe
            start_time: Inclusive start-time filter
            end_time: Inclusive end-time filter
            limit: Maximum number of candles (the most recent are kept)

        Returns:
            DataFrame indexed by UTC timestamp, sorted ascending,
            or None if no rows matched (or on error).
        """
        try:
            # Build query
            query = """
                SELECT timestamp, open, high, low, close, volume
                FROM ohlcv_data
                WHERE symbol = ? AND timeframe = ?
            """
            params: List[Any] = [symbol, timeframe]

            if start_time:
                query += " AND timestamp >= ?"
                params.append(int(start_time.timestamp() * 1000))

            if end_time:
                query += " AND timestamp <= ?"
                params.append(int(end_time.timestamp() * 1000))

            # DESC + LIMIT keeps the most recent candles; re-sorted below.
            query += " ORDER BY timestamp DESC"

            if limit:
                # Bound parameter instead of f-string interpolation.
                query += " LIMIT ?"
                params.append(int(limit))

            conn = sqlite3.connect(self.db_path)
            try:
                df = pd.read_sql_query(query, conn, params=params)
            finally:
                conn.close()

            if df.empty:
                return None

            # Convert Unix ms back to a UTC datetime index, ascending.
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)
            df = df.set_index('timestamp').sort_index()

            logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from SQLite")
            return df

        except Exception as e:
            logger.error(f"Error retrieving OHLCV data from SQLite: {e}")
            return None

    def store_annotation(self, annotation_id: str, annotation_data: Dict[str, Any],
                         market_snapshots: Dict[str, pd.DataFrame],
                         model_predictions: Optional[List[Dict]] = None) -> bool:
        """
        Store complete annotation with market context and model data.

        Args:
            annotation_id: Unique annotation ID
            annotation_data: Annotation metadata (entry, exit, symbol, etc.)
            market_snapshots: Dict of {timeframe: DataFrame} with OHLCV data
            model_predictions: List of model predictions at annotation time

        Returns:
            True if successful, False on any error (logged with traceback).
        """
        try:
            # Parse timestamps (ISO-8601 strings with trailing 'Z' accepted).
            entry_time = annotation_data.get('entry', {}).get('timestamp')
            exit_time = annotation_data.get('exit', {}).get('timestamp')

            if isinstance(entry_time, str):
                entry_time = datetime.fromisoformat(entry_time.replace('Z', '+00:00'))
            if isinstance(exit_time, str):
                exit_time = datetime.fromisoformat(exit_time.replace('Z', '+00:00'))
            if entry_time is None or exit_time is None:
                # Explicit failure instead of an opaque AttributeError below.
                raise ValueError("annotation is missing entry/exit timestamp")

            now_ms = int(datetime.now().timestamp() * 1000)

            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.cursor()

                # Store annotation metadata
                cursor.execute("""
                    INSERT OR REPLACE INTO annotations
                    (annotation_id, symbol, timeframe, direction,
                     entry_timestamp, entry_price, exit_timestamp, exit_price,
                     profit_loss_pct, notes, created_at, market_context, model_features, pivot_data)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    annotation_id,
                    annotation_data.get('symbol'),
                    annotation_data.get('timeframe'),
                    annotation_data.get('direction'),
                    int(entry_time.timestamp() * 1000),
                    annotation_data.get('entry', {}).get('price'),
                    int(exit_time.timestamp() * 1000),
                    annotation_data.get('exit', {}).get('price'),
                    annotation_data.get('profit_loss_pct'),
                    annotation_data.get('notes', ''),
                    now_ms,
                    json.dumps(annotation_data.get('entry_market_state', {})),
                    json.dumps(annotation_data.get('model_features', {})),
                    json.dumps(annotation_data.get('pivot_data', {}))
                ))

                # Store OHLCV snapshots for each timeframe (batched insert
                # instead of one execute() per candle).
                for timeframe, df in market_snapshots.items():
                    if df is None or df.empty:
                        continue

                    df_copy = self._to_epoch_ms(df)
                    snapshot_rows = [
                        (annotation_id, timeframe, int(ts),
                         float(o), float(h), float(l), float(c), float(v))
                        for ts, o, h, l, c, v in zip(
                            df_copy['timestamp'], df_copy['open'], df_copy['high'],
                            df_copy['low'], df_copy['close'], df_copy['volume'])
                    ]
                    cursor.executemany("""
                        INSERT OR REPLACE INTO annotation_ohlcv_snapshots
                        (annotation_id, timeframe, timestamp, open, high, low, close, volume)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    """, snapshot_rows)

                # Store model predictions
                if model_predictions:
                    cursor.executemany("""
                        INSERT INTO annotation_model_predictions
                        (annotation_id, model_name, timestamp, prediction, confidence, features)
                        VALUES (?, ?, ?, ?, ?, ?)
                    """, [
                        (annotation_id,
                         pred.get('model_name'),
                         int(pred.get('timestamp', now_ms)),
                         json.dumps(pred.get('prediction')),
                         pred.get('confidence'),
                         json.dumps(pred.get('features', {})))
                        for pred in model_predictions
                    ])

                conn.commit()
            finally:
                conn.close()

            logger.info(f"Stored annotation {annotation_id} with {len(market_snapshots)} timeframes in SQLite")
            return True

        except Exception as e:
            # logger.exception records the traceback (replaces print_exc()).
            logger.exception(f"Error storing annotation in SQLite: {e}")
            return False

    def get_annotation(self, annotation_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve complete annotation with all market data.

        Args:
            annotation_id: Annotation ID

        Returns:
            Dict with annotation metadata plus 'ohlcv_snapshots'
            ({timeframe: DataFrame}) and 'model_predictions' (list),
            or None if not found or on error.
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.cursor()

                # Get annotation metadata
                cursor.execute(
                    "SELECT * FROM annotations WHERE annotation_id = ?",
                    (annotation_id,))
                row = cursor.fetchone()
                if not row:
                    return None

                # Parse annotation data
                columns = [desc[0] for desc in cursor.description]
                annotation = dict(zip(columns, row))

                # JSON columns are nullable: 'or' guards against NULL values
                # (json.loads(None) would raise TypeError).
                for field in ('market_context', 'model_features', 'pivot_data'):
                    annotation[field] = json.loads(annotation.get(field) or '{}')

                # Get OHLCV snapshots
                cursor.execute("""
                    SELECT timeframe, timestamp, open, high, low, close, volume
                    FROM annotation_ohlcv_snapshots
                    WHERE annotation_id = ?
                    ORDER BY timeframe, timestamp
                """, (annotation_id,))

                snapshots: Dict[str, List[Dict[str, Any]]] = {}
                for tf, ts, o, h, lo, c, v in cursor.fetchall():
                    snapshots.setdefault(tf, []).append({
                        'timestamp': ts,
                        'open': o,
                        'high': h,
                        'low': lo,
                        'close': c,
                        'volume': v
                    })

                # Convert to DataFrames indexed by UTC timestamp
                annotation['ohlcv_snapshots'] = {}
                for tf, data in snapshots.items():
                    df = pd.DataFrame(data)
                    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)
                    annotation['ohlcv_snapshots'][tf] = df.set_index('timestamp')

                # Get model predictions
                cursor.execute("""
                    SELECT model_name, timestamp, prediction, confidence, features
                    FROM annotation_model_predictions
                    WHERE annotation_id = ?
                """, (annotation_id,))

                annotation['model_predictions'] = [
                    {
                        'model_name': name,
                        'timestamp': ts,
                        'prediction': json.loads(pred),
                        'confidence': conf,
                        # features is nullable -> default to empty dict
                        'features': json.loads(feats) if feats else {}
                    }
                    for name, ts, pred, conf, feats in cursor.fetchall()
                ]
            finally:
                conn.close()

            logger.info(f"Retrieved annotation {annotation_id} with {len(snapshots)} timeframes from SQLite")
            return annotation

        except Exception as e:
            logger.error(f"Error retrieving annotation from SQLite: {e}")
            return None

    def get_cache_stats(self) -> Dict[str, Any]:
        """Return cache statistics.

        Returns:
            Dict with 'ohlcv_stats' (per symbol/timeframe coverage),
            'annotation_count' and 'total_candles'; empty dict on error.
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.cursor()

                # Per (symbol, timeframe) coverage from cache_metadata
                cursor.execute("""
                    SELECT symbol, timeframe, candle_count, first_timestamp, last_timestamp
                    FROM cache_metadata
                    ORDER BY symbol, timeframe
                """)
                ohlcv_stats = [
                    {
                        'symbol': symbol,
                        'timeframe': timeframe,
                        'candle_count': count,
                        # Stored as Unix ms; converted to local-time datetimes
                        # (matches original behavior of fromtimestamp()).
                        'first_timestamp': datetime.fromtimestamp(first_ts / 1000),
                        'last_timestamp': datetime.fromtimestamp(last_ts / 1000)
                    }
                    for symbol, timeframe, count, first_ts, last_ts in cursor.fetchall()
                ]

                cursor.execute("SELECT COUNT(*) FROM annotations")
                annotation_count = cursor.fetchone()[0]

                cursor.execute("SELECT COUNT(*) FROM ohlcv_data")
                total_candles = cursor.fetchone()[0]
            finally:
                conn.close()

            return {
                'ohlcv_stats': ohlcv_stats,
                'annotation_count': annotation_count,
                'total_candles': total_candles
            }

        except Exception as e:
            logger.error(f"Error getting cache stats: {e}")
            return {}