Files
gogo2/core/sqlite_storage.py
2025-10-24 15:57:33 +03:00

527 lines
20 KiB
Python

"""
SQLite Storage for Long-Term OHLCV Data and Annotation Replay
This module provides persistent storage for:
1. OHLCV data for all timeframes (unlimited history)
2. Complete annotation data with market context
3. Model predictions and features at annotation time
4. Efficient querying for historical replay
Parquet files are used for recent data (1500 candles) for speed.
SQLite is used for long-term storage and annotation replay.
"""
import json
import logging
import pickle
import sqlite3
from contextlib import closing
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any

import pandas as pd
logger = logging.getLogger(__name__)


class SQLiteStorage:
    """SQLite storage for OHLCV data and annotations.

    Every public method opens its own short-lived connection and always
    closes it, including on error paths. Candle timestamps are persisted as
    Unix epoch milliseconds; timezone-naive pandas timestamps are stored as
    if they were UTC.
    """

    # Candle columns, in the order they are written to the candle tables.
    _CANDLE_COLUMNS = ('timestamp', 'open', 'high', 'low', 'close', 'volume')

    # DDL executed on start-up. Every statement is idempotent
    # (IF NOT EXISTS), so running it against an existing database is safe.
    _SCHEMA_STATEMENTS = (
        # OHLCV data table - stores all historical candles
        """
        CREATE TABLE IF NOT EXISTS ohlcv_data (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            symbol TEXT NOT NULL,
            timeframe TEXT NOT NULL,
            timestamp INTEGER NOT NULL,
            open REAL NOT NULL,
            high REAL NOT NULL,
            low REAL NOT NULL,
            close REAL NOT NULL,
            volume REAL NOT NULL,
            created_at INTEGER NOT NULL,
            UNIQUE(symbol, timeframe, timestamp)
        )
        """,
        # Indexes for fast queries
        """
        CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_timeframe
        ON ohlcv_data(symbol, timeframe)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_ohlcv_timestamp
        ON ohlcv_data(timestamp)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_ohlcv_lookup
        ON ohlcv_data(symbol, timeframe, timestamp)
        """,
        # Annotations table - stores complete annotation data
        """
        CREATE TABLE IF NOT EXISTS annotations (
            annotation_id TEXT PRIMARY KEY,
            symbol TEXT NOT NULL,
            timeframe TEXT NOT NULL,
            direction TEXT NOT NULL,
            entry_timestamp INTEGER NOT NULL,
            entry_price REAL NOT NULL,
            exit_timestamp INTEGER NOT NULL,
            exit_price REAL NOT NULL,
            profit_loss_pct REAL NOT NULL,
            notes TEXT,
            created_at INTEGER NOT NULL,
            market_context TEXT,
            model_features TEXT,
            pivot_data TEXT
        )
        """,
        # Annotation OHLCV snapshots - market data at annotation time
        """
        CREATE TABLE IF NOT EXISTS annotation_ohlcv_snapshots (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            annotation_id TEXT NOT NULL,
            timeframe TEXT NOT NULL,
            timestamp INTEGER NOT NULL,
            open REAL NOT NULL,
            high REAL NOT NULL,
            low REAL NOT NULL,
            close REAL NOT NULL,
            volume REAL NOT NULL,
            FOREIGN KEY (annotation_id) REFERENCES annotations(annotation_id),
            UNIQUE(annotation_id, timeframe, timestamp)
        )
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_annotation_snapshots
        ON annotation_ohlcv_snapshots(annotation_id, timeframe)
        """,
        # Model predictions table - model outputs at annotation time
        """
        CREATE TABLE IF NOT EXISTS annotation_model_predictions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            annotation_id TEXT NOT NULL,
            model_name TEXT NOT NULL,
            timestamp INTEGER NOT NULL,
            prediction TEXT NOT NULL,
            confidence REAL,
            features TEXT,
            FOREIGN KEY (annotation_id) REFERENCES annotations(annotation_id)
        )
        """,
        # Cache metadata table - tracks what data we have
        """
        CREATE TABLE IF NOT EXISTS cache_metadata (
            symbol TEXT NOT NULL,
            timeframe TEXT NOT NULL,
            first_timestamp INTEGER NOT NULL,
            last_timestamp INTEGER NOT NULL,
            candle_count INTEGER NOT NULL,
            last_update INTEGER NOT NULL,
            PRIMARY KEY (symbol, timeframe)
        )
        """,
    )

    def __init__(self, db_path: str = "cache/trading_data.db"):
        """Initialize SQLite storage.

        Args:
            db_path: Location of the database file. Missing parent
                directories are created.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        # Initialize database schema
        self._init_schema()
        logger.info(f"SQLite storage initialized: {self.db_path}")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _connect(self):
        """Return a context manager yielding a connection that is closed on exit.

        ``with sqlite3.connect(...)`` only commits or rolls back; it does not
        close the connection, hence the ``closing`` wrapper.
        """
        return closing(sqlite3.connect(self.db_path))

    @staticmethod
    def _now_ms() -> int:
        """Return the current time as Unix epoch milliseconds."""
        return int(datetime.now().timestamp() * 1000)

    @staticmethod
    def _load_json(text: Optional[str], default: Any) -> Any:
        """Decode a JSON column, returning ``default`` for NULL or empty text."""
        return json.loads(text) if text else default

    @staticmethod
    def _with_epoch_ms(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with a ``timestamp`` column in epoch ms.

        If ``df`` has no ``timestamp`` column the index is used. Datetime
        values are converted by subtracting the epoch, which is correct for
        any datetime64 resolution (ns/us/ms/s); naive values are treated as
        UTC. Non-datetime columns are left untouched.
        """
        frame = df.copy()
        if 'timestamp' not in frame.columns:
            frame['timestamp'] = frame.index
        stamps = frame['timestamp']
        if pd.api.types.is_datetime64_any_dtype(stamps):
            if stamps.dt.tz is not None:
                epoch = pd.Timestamp('1970-01-01', tz='UTC')
            else:
                epoch = pd.Timestamp('1970-01-01')
            frame['timestamp'] = (stamps - epoch) // pd.Timedelta(milliseconds=1)
        return frame

    @classmethod
    def _candle_rows(cls, frame: pd.DataFrame) -> List[Tuple]:
        """Turn a prepared frame into ``(ts, open, high, low, close, volume)`` tuples.

        Values are coerced to built-in ``int``/``float`` because ``sqlite3``
        does not bind numpy integer scalars as integers.
        """
        series = [frame[name].tolist() for name in cls._CANDLE_COLUMNS]
        return [
            (int(ts), float(o), float(h), float(lo), float(c), float(v))
            for ts, o, h, lo, c, v in zip(*series)
        ]

    @staticmethod
    def _moment_to_ms(value: Any) -> int:
        """Convert an ISO-8601 string or datetime-like object to epoch ms.

        A trailing ``Z`` is accepted as UTC. Naive datetimes are interpreted
        by ``datetime.timestamp()``, i.e. as local time.
        """
        if isinstance(value, str):
            value = datetime.fromisoformat(value.replace('Z', '+00:00'))
        return int(value.timestamp() * 1000)

    def _init_schema(self):
        """Initialize database schema (tables and indexes), idempotently."""
        with self._connect() as conn:
            cursor = conn.cursor()
            for statement in self._SCHEMA_STATEMENTS:
                cursor.execute(statement)
            conn.commit()
        logger.info("SQLite schema initialized")

    # ------------------------------------------------------------------
    # OHLCV data
    # ------------------------------------------------------------------
    def store_ohlcv_data(self, symbol: str, timeframe: str, df: pd.DataFrame) -> int:
        """
        Store OHLCV data in SQLite

        Candles that already exist for (symbol, timeframe, timestamp) are
        skipped rather than aborting the whole batch.

        Args:
            symbol: Trading symbol
            timeframe: Timeframe
            df: DataFrame with OHLCV data (timestamp as index or column)

        Returns:
            Number of rows actually inserted (0 on error or empty input)
        """
        if df is None or df.empty:
            return 0
        try:
            frame = self._with_epoch_ms(df)
            now_ms = self._now_ms()
            rows = [
                (symbol, timeframe) + candle + (now_ms,)
                for candle in self._candle_rows(frame)
            ]
            with self._connect() as conn:
                changes_before = conn.total_changes
                # OR IGNORE skips rows that hit the UNIQUE constraint
                conn.executemany("""
                    INSERT OR IGNORE INTO ohlcv_data
                    (symbol, timeframe, timestamp, open, high, low, close, volume, created_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, rows)
                rows_inserted = conn.total_changes - changes_before

                # Refresh metadata from what is actually stored
                first_ts, last_ts, candle_count = conn.execute("""
                    SELECT MIN(timestamp), MAX(timestamp), COUNT(*)
                    FROM ohlcv_data
                    WHERE symbol = ? AND timeframe = ?
                """, (symbol, timeframe)).fetchone()
                if candle_count:
                    conn.execute("""
                        INSERT OR REPLACE INTO cache_metadata
                        (symbol, timeframe, first_timestamp, last_timestamp, candle_count, last_update)
                        VALUES (?, ?, ?, ?, ?, ?)
                    """, (symbol, timeframe, first_ts, last_ts, candle_count, now_ms))
                conn.commit()
            logger.info(f"Stored {rows_inserted} candles for {symbol} {timeframe} in SQLite")
            return rows_inserted
        except Exception as e:
            logger.error(f"Error storing OHLCV data in SQLite: {e}")
            return 0

    def get_ohlcv_data(self, symbol: str, timeframe: str,
                       start_time: Optional[datetime] = None,
                       end_time: Optional[datetime] = None,
                       limit: Optional[int] = None) -> Optional[pd.DataFrame]:
        """
        Retrieve OHLCV data from SQLite

        When ``limit`` is given, the most recent ``limit`` candles inside the
        requested window are returned.

        Args:
            symbol: Trading symbol
            timeframe: Timeframe
            start_time: Start time filter (inclusive)
            end_time: End time filter (inclusive)
            limit: Maximum number of candles

        Returns:
            DataFrame with OHLCV data indexed by UTC timestamp in ascending
            order, or None if nothing matched or an error occurred
        """
        try:
            # Build query
            query = """
                SELECT timestamp, open, high, low, close, volume
                FROM ohlcv_data
                WHERE symbol = ? AND timeframe = ?
            """
            params: List[Any] = [symbol, timeframe]
            if start_time is not None:
                query += " AND timestamp >= ?"
                params.append(int(start_time.timestamp() * 1000))
            if end_time is not None:
                query += " AND timestamp <= ?"
                params.append(int(end_time.timestamp() * 1000))
            # Newest first so that LIMIT keeps the most recent candles
            query += " ORDER BY timestamp DESC"
            if limit:
                query += " LIMIT ?"
                params.append(int(limit))

            with self._connect() as conn:
                df = pd.read_sql_query(query, conn, params=params)
            if df.empty:
                return None

            # Convert timestamp to datetime and restore chronological order
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)
            df = df.set_index('timestamp').sort_index()
            logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from SQLite")
            return df
        except Exception as e:
            logger.error(f"Error retrieving OHLCV data from SQLite: {e}")
            return None

    # ------------------------------------------------------------------
    # Annotations
    # ------------------------------------------------------------------
    def store_annotation(self, annotation_id: str, annotation_data: Dict[str, Any],
                         market_snapshots: Dict[str, pd.DataFrame],
                         model_predictions: Optional[List[Dict]] = None) -> bool:
        """
        Store complete annotation with market context and model data

        Everything is written in one transaction; on any error nothing is
        committed. Storing an existing ``annotation_id`` replaces its
        metadata, upserts the supplied snapshot candles and, when
        ``model_predictions`` is given, replaces the stored predictions
        instead of appending duplicates.

        Args:
            annotation_id: Unique annotation ID
            annotation_data: Annotation metadata (entry, exit, symbol, etc.)
            market_snapshots: Dict of {timeframe: DataFrame} with OHLCV data
            model_predictions: List of model predictions at annotation time

        Returns:
            True if successful
        """
        try:
            entry = annotation_data.get('entry', {})
            exit_ = annotation_data.get('exit', {})
            entry_time = entry.get('timestamp')
            exit_time = exit_.get('timestamp')
            if entry_time is None or exit_time is None:
                logger.error(
                    f"Error storing annotation in SQLite: "
                    f"annotation {annotation_id} is missing entry/exit timestamp"
                )
                return False

            snapshots = market_snapshots or {}
            now_ms = self._now_ms()

            with self._connect() as conn:
                cursor = conn.cursor()
                # Store annotation metadata
                cursor.execute("""
                    INSERT OR REPLACE INTO annotations
                    (annotation_id, symbol, timeframe, direction,
                    entry_timestamp, entry_price, exit_timestamp, exit_price,
                    profit_loss_pct, notes, created_at, market_context, model_features, pivot_data)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    annotation_id,
                    annotation_data.get('symbol'),
                    annotation_data.get('timeframe'),
                    annotation_data.get('direction'),
                    self._moment_to_ms(entry_time),
                    entry.get('price'),
                    self._moment_to_ms(exit_time),
                    exit_.get('price'),
                    annotation_data.get('profit_loss_pct'),
                    annotation_data.get('notes', ''),
                    now_ms,
                    json.dumps(annotation_data.get('entry_market_state', {})),
                    json.dumps(annotation_data.get('model_features', {})),
                    json.dumps(annotation_data.get('pivot_data', {})),
                ))

                # Store OHLCV snapshots for each timeframe
                for timeframe, df in snapshots.items():
                    if df is None or df.empty:
                        continue
                    frame = self._with_epoch_ms(df)
                    cursor.executemany("""
                        INSERT OR REPLACE INTO annotation_ohlcv_snapshots
                        (annotation_id, timeframe, timestamp, open, high, low, close, volume)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    """, [
                        (annotation_id, timeframe) + candle
                        for candle in self._candle_rows(frame)
                    ])

                # Store model predictions
                if model_predictions:
                    # The table has no uniqueness constraint, so clear the
                    # previous set to avoid duplicates on re-store.
                    cursor.execute(
                        "DELETE FROM annotation_model_predictions WHERE annotation_id = ?",
                        (annotation_id,),
                    )
                    prediction_rows = []
                    for pred in model_predictions:
                        pred_ts = pred.get('timestamp')
                        if pred_ts is None:
                            pred_ts = now_ms
                        prediction_rows.append((
                            annotation_id,
                            pred.get('model_name'),
                            int(pred_ts),
                            json.dumps(pred.get('prediction')),
                            pred.get('confidence'),
                            json.dumps(pred.get('features', {})),
                        ))
                    cursor.executemany("""
                        INSERT INTO annotation_model_predictions
                        (annotation_id, model_name, timestamp, prediction, confidence, features)
                        VALUES (?, ?, ?, ?, ?, ?)
                    """, prediction_rows)
                conn.commit()

            logger.info(f"Stored annotation {annotation_id} with {len(snapshots)} timeframes in SQLite")
            return True
        except Exception as e:
            # logger.exception records the traceback through the logging system
            logger.exception(f"Error storing annotation in SQLite: {e}")
            return False

    def get_annotation(self, annotation_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve complete annotation with all market data

        Args:
            annotation_id: Annotation ID

        Returns:
            Dict with the annotation columns (JSON columns decoded, NULL
            decoded as ``{}``), plus ``ohlcv_snapshots``
            ({timeframe: DataFrame indexed by UTC timestamp}) and
            ``model_predictions`` (list of dicts). None if the annotation
            does not exist or an error occurred.
        """
        try:
            with self._connect() as conn:
                cursor = conn.cursor()
                # Get annotation metadata
                cursor.execute("""
                    SELECT * FROM annotations WHERE annotation_id = ?
                """, (annotation_id,))
                row = cursor.fetchone()
                if not row:
                    return None
                columns = [desc[0] for desc in cursor.description]
                annotation = dict(zip(columns, row))

                # Parse JSON fields (columns are nullable)
                for field in ('market_context', 'model_features', 'pivot_data'):
                    annotation[field] = self._load_json(annotation.get(field), {})

                # Get OHLCV snapshots, grouped by timeframe
                cursor.execute("""
                    SELECT timeframe, timestamp, open, high, low, close, volume
                    FROM annotation_ohlcv_snapshots
                    WHERE annotation_id = ?
                    ORDER BY timeframe, timestamp
                """, (annotation_id,))
                snapshots: Dict[str, List[Dict[str, Any]]] = {}
                for timeframe, ts, o, h, lo, c, v in cursor.fetchall():
                    snapshots.setdefault(timeframe, []).append({
                        'timestamp': ts,
                        'open': o,
                        'high': h,
                        'low': lo,
                        'close': c,
                        'volume': v,
                    })

                # Get model predictions
                cursor.execute("""
                    SELECT model_name, timestamp, prediction, confidence, features
                    FROM annotation_model_predictions
                    WHERE annotation_id = ?
                """, (annotation_id,))
                annotation['model_predictions'] = [
                    {
                        'model_name': model_name,
                        'timestamp': ts,
                        'prediction': json.loads(prediction),
                        'confidence': confidence,
                        'features': self._load_json(features, {}),
                    }
                    for model_name, ts, prediction, confidence, features in cursor.fetchall()
                ]

            # Convert snapshots to DataFrames
            annotation['ohlcv_snapshots'] = {}
            for timeframe, candles in snapshots.items():
                df = pd.DataFrame(candles)
                df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)
                annotation['ohlcv_snapshots'][timeframe] = df.set_index('timestamp')

            logger.info(f"Retrieved annotation {annotation_id} with {len(snapshots)} timeframes from SQLite")
            return annotation
        except Exception as e:
            logger.error(f"Error retrieving annotation from SQLite: {e}")
            return None

    # ------------------------------------------------------------------
    # Statistics
    # ------------------------------------------------------------------
    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics

        Returns:
            Dict with ``ohlcv_stats`` (one entry per symbol/timeframe taken
            from cache_metadata), ``annotation_count`` and ``total_candles``;
            an empty dict on error. ``first_timestamp``/``last_timestamp``
            are naive datetimes produced by ``datetime.fromtimestamp`` (local
            time), unlike the UTC-aware index from ``get_ohlcv_data``.
        """
        try:
            with self._connect() as conn:
                cursor = conn.cursor()
                # Get OHLCV stats
                cursor.execute("""
                    SELECT symbol, timeframe, candle_count, first_timestamp, last_timestamp
                    FROM cache_metadata
                    ORDER BY symbol, timeframe
                """)
                ohlcv_stats = [
                    {
                        'symbol': symbol,
                        'timeframe': timeframe,
                        'candle_count': candle_count,
                        'first_timestamp': datetime.fromtimestamp(first_ts / 1000),
                        'last_timestamp': datetime.fromtimestamp(last_ts / 1000),
                    }
                    for symbol, timeframe, candle_count, first_ts, last_ts in cursor.fetchall()
                ]
                # Get annotation count
                cursor.execute("SELECT COUNT(*) FROM annotations")
                annotation_count = cursor.fetchone()[0]
                # Get total OHLCV rows
                cursor.execute("SELECT COUNT(*) FROM ohlcv_data")
                total_candles = cursor.fetchone()[0]
            return {
                'ohlcv_stats': ohlcv_stats,
                'annotation_count': annotation_count,
                'total_candles': total_candles,
            }
        except Exception as e:
            logger.error(f"Error getting cache stats: {e}")
            return {}