# gogo2/core/sqlite_storage.py

"""
SQLite Storage for Long-Term OHLCV Data and Annotation Replay

This module provides persistent storage for:
1. OHLCV data for all timeframes (unlimited history)
2. Complete annotation data with market context
3. Model predictions and features at annotation time
4. Efficient querying for historical replay

Parquet files are used for recent data (1500 candles) for speed.
SQLite is used for long-term storage and annotation replay.
"""

import sqlite3
import logging
import pandas as pd
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
import json
import pickle

logger = logging.getLogger(__name__)


class SQLiteStorage:
    """SQLite storage for OHLCV data and annotations"""

    def __init__(self, db_path: str = "cache/trading_data.db"):
        """Set up the storage backend for the given database file.

        Args:
            db_path: Location of the SQLite database file. Missing parent
                directories are created.
        """
        self.db_path = Path(db_path)

        # The database file lives inside this directory, so it has to exist
        # before the first connection is opened.
        parent_dir = self.db_path.parent
        parent_dir.mkdir(parents=True, exist_ok=True)

        # Create tables and indexes up front (no-op when they already exist).
        self._init_schema()

        logger.info(f"SQLite storage initialized: {self.db_path}")

    def _init_schema(self):
        """Create all tables and indexes if they do not exist yet.

        Every statement uses ``IF NOT EXISTS``, so calling this on an already
        initialized database changes nothing. The connection is closed even
        when a statement fails; the error itself is propagated to the caller.

        Raises:
            sqlite3.Error: If the database cannot be opened or a statement fails.
        """
        schema_statements = (
            # OHLCV data table - stores all historical candles
            """
            CREATE TABLE IF NOT EXISTS ohlcv_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                symbol TEXT NOT NULL,
                timeframe TEXT NOT NULL,
                timestamp INTEGER NOT NULL,
                open REAL NOT NULL,
                high REAL NOT NULL,
                low REAL NOT NULL,
                close REAL NOT NULL,
                volume REAL NOT NULL,
                created_at INTEGER NOT NULL,
                UNIQUE(symbol, timeframe, timestamp)
            )
            """,
            # Indexes for fast queries. SQLite already maintains an implicit
            # index for the UNIQUE constraint above; the explicit ones are kept
            # so that new and existing databases share the same schema.
            """
            CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_timeframe
            ON ohlcv_data(symbol, timeframe)
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_ohlcv_timestamp
            ON ohlcv_data(timestamp)
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_ohlcv_lookup
            ON ohlcv_data(symbol, timeframe, timestamp)
            """,
            # Annotations table - stores complete annotation data
            # (market_context, model_features and pivot_data hold JSON text)
            """
            CREATE TABLE IF NOT EXISTS annotations (
                annotation_id TEXT PRIMARY KEY,
                symbol TEXT NOT NULL,
                timeframe TEXT NOT NULL,
                direction TEXT NOT NULL,
                entry_timestamp INTEGER NOT NULL,
                entry_price REAL NOT NULL,
                exit_timestamp INTEGER NOT NULL,
                exit_price REAL NOT NULL,
                profit_loss_pct REAL NOT NULL,
                notes TEXT,
                created_at INTEGER NOT NULL,
                market_context TEXT,
                model_features TEXT,
                pivot_data TEXT
            )
            """,
            # Annotation OHLCV snapshots - stores market data at annotation time
            """
            CREATE TABLE IF NOT EXISTS annotation_ohlcv_snapshots (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                annotation_id TEXT NOT NULL,
                timeframe TEXT NOT NULL,
                timestamp INTEGER NOT NULL,
                open REAL NOT NULL,
                high REAL NOT NULL,
                low REAL NOT NULL,
                close REAL NOT NULL,
                volume REAL NOT NULL,
                FOREIGN KEY (annotation_id) REFERENCES annotations(annotation_id),
                UNIQUE(annotation_id, timeframe, timestamp)
            )
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_annotation_snapshots
            ON annotation_ohlcv_snapshots(annotation_id, timeframe)
            """,
            # Model predictions table - stores model outputs at annotation time
            """
            CREATE TABLE IF NOT EXISTS annotation_model_predictions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                annotation_id TEXT NOT NULL,
                model_name TEXT NOT NULL,
                timestamp INTEGER NOT NULL,
                prediction TEXT NOT NULL,
                confidence REAL,
                features TEXT,
                FOREIGN KEY (annotation_id) REFERENCES annotations(annotation_id)
            )
            """,
            # Cache metadata table - tracks what data we have
            """
            CREATE TABLE IF NOT EXISTS cache_metadata (
                symbol TEXT NOT NULL,
                timeframe TEXT NOT NULL,
                first_timestamp INTEGER NOT NULL,
                last_timestamp INTEGER NOT NULL,
                candle_count INTEGER NOT NULL,
                last_update INTEGER NOT NULL,
                PRIMARY KEY (symbol, timeframe)
            )
            """,
        )

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            for statement in schema_statements:
                cursor.execute(statement)
            conn.commit()
        finally:
            # Release the file handle even if a statement raised.
            conn.close()

        logger.info("SQLite schema initialized")

    def store_ohlcv_data(self, symbol: str, timeframe: str, df: pd.DataFrame) -> int:
        """
        Store OHLCV data in SQLite

        Candles whose (symbol, timeframe, timestamp) is already stored are
        skipped rather than failing the whole batch, so overlapping batches
        can be written repeatedly. Existing candles are never overwritten.
        The cache_metadata row for the pair is refreshed from the table
        contents after the insert.

        Args:
            symbol: Trading symbol
            timeframe: Timeframe
            df: DataFrame with open/high/low/close/volume columns and the
                candle time either in a 'timestamp' column or as the index.
                Datetime values are converted to Unix milliseconds (naive
                values are treated as UTC); any other values are stored as
                integers.

        Returns:
            Number of candles newly inserted. 0 for empty input, when every
            candle was already stored, or when an error occurred (the error
            is logged, not raised).
        """
        if df is None or df.empty:
            return 0

        conn = None
        try:
            # Candle time comes from the 'timestamp' column when present,
            # otherwise from the index.
            if 'timestamp' in df.columns:
                timestamps = df['timestamp']
            else:
                timestamps = df.index.to_series()

            # Convert datetimes to Unix milliseconds. Subtracting the epoch
            # works for any datetime resolution and for tz-aware data, unlike
            # dividing the raw int64 view by a fixed factor.
            if pd.api.types.is_datetime64_any_dtype(timestamps):
                epoch = pd.Timestamp("1970-01-01", tz="UTC")
                timestamps = (pd.to_datetime(timestamps, utc=True) - epoch) // pd.Timedelta(milliseconds=1)

            # .tolist() yields plain Python ints/floats, which sqlite3 binds
            # natively.
            ts_values = timestamps.astype('int64').tolist()
            created_at = int(datetime.now().timestamp() * 1000)
            rows = [
                (symbol, timeframe, ts, o, h, l, c, v, created_at)
                for ts, o, h, l, c, v in zip(
                    ts_values,
                    df['open'].astype(float).tolist(),
                    df['high'].astype(float).tolist(),
                    df['low'].astype(float).tolist(),
                    df['close'].astype(float).tolist(),
                    df['volume'].astype(float).tolist(),
                )
            ]

            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            # Insert data (ignore duplicates): OR IGNORE skips any row that is
            # rejected by a table constraint instead of aborting the batch.
            changes_before = conn.total_changes
            cursor.executemany("""
                INSERT OR IGNORE INTO ohlcv_data
                (symbol, timeframe, timestamp, open, high, low, close, volume, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, rows)
            rows_inserted = conn.total_changes - changes_before

            # Update metadata from what is actually in the table; the batch
            # min/max are only a fallback should the subqueries return NULL.
            cursor.execute("""
                INSERT OR REPLACE INTO cache_metadata
                (symbol, timeframe, first_timestamp, last_timestamp, candle_count, last_update)
                VALUES (?, ?,
                    COALESCE((SELECT MIN(timestamp) FROM ohlcv_data WHERE symbol=? AND timeframe=?), ?),
                    COALESCE((SELECT MAX(timestamp) FROM ohlcv_data WHERE symbol=? AND timeframe=?), ?),
                    (SELECT COUNT(*) FROM ohlcv_data WHERE symbol=? AND timeframe=?),
                    ?)
            """, (
                symbol, timeframe,
                symbol, timeframe, min(ts_values),
                symbol, timeframe, max(ts_values),
                symbol, timeframe,
                created_at
            ))

            conn.commit()

            logger.info(f"Stored {rows_inserted} candles for {symbol} {timeframe} in SQLite")
            return rows_inserted

        except Exception as e:
            logger.error(f"Error storing OHLCV data in SQLite: {e}")
            return 0

        finally:
            # Closing without commit discards any partially written batch.
            if conn is not None:
                conn.close()

    def get_ohlcv_data(self, symbol: str, timeframe: str,
                       start_time: Optional[datetime] = None,
                       end_time: Optional[datetime] = None,
                       limit: Optional[int] = None) -> Optional[pd.DataFrame]:
        """
        Retrieve OHLCV data from SQLite

        When ``limit`` is given, the most recent ``limit`` candles inside the
        requested window are returned. The result is always sorted oldest
        first.

        Args:
            symbol: Trading symbol
            timeframe: Timeframe
            start_time: Inclusive start time filter. Converted with
                ``datetime.timestamp()``, which interprets naive datetimes as
                local time; pass timezone-aware values to avoid ambiguity.
            end_time: Inclusive end time filter (same conversion as start_time)
            limit: Maximum number of candles; None or 0 means no limit

        Returns:
            DataFrame with open/high/low/close/volume columns indexed by a
            UTC 'timestamp', or None when nothing matches or an error
            occurred (the error is logged, not raised)
        """
        conn = None
        try:
            # Build query; every user-supplied value is bound as a parameter.
            query = """
                SELECT timestamp, open, high, low, close, volume
                FROM ohlcv_data
                WHERE symbol = ? AND timeframe = ?
            """
            params: List[Any] = [symbol, timeframe]

            if start_time is not None:
                query += " AND timestamp >= ?"
                params.append(int(start_time.timestamp() * 1000))

            if end_time is not None:
                query += " AND timestamp <= ?"
                params.append(int(end_time.timestamp() * 1000))

            # Newest first so that LIMIT keeps the most recent candles.
            query += " ORDER BY timestamp DESC"

            if limit:
                query += " LIMIT ?"
                params.append(int(limit))

            # Execute query
            conn = sqlite3.connect(self.db_path)
            df = pd.read_sql_query(query, conn, params=params)

            if df.empty:
                return None

            # Convert timestamp to datetime and restore chronological order
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)
            df = df.set_index('timestamp')
            df = df.sort_index()

            logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from SQLite")
            return df

        except Exception as e:
            logger.error(f"Error retrieving OHLCV data from SQLite: {e}")
            return None

        finally:
            if conn is not None:
                conn.close()

    def store_annotation(self, annotation_id: str, annotation_data: Dict[str, Any],
                        market_snapshots: Dict[str, pd.DataFrame],
                        model_predictions: Optional[List[Dict]] = None) -> bool:
        """
        Store complete annotation with market context and model data

        Everything is written in a single transaction: if any step fails,
        nothing from this call is persisted.

        Args:
            annotation_id: Unique annotation ID; an existing annotation row
                with the same ID is replaced
            annotation_data: Annotation metadata. Reads 'symbol', 'timeframe',
                'direction', 'profit_loss_pct', 'notes', 'entry_market_state',
                'model_features', 'pivot_data' and the nested 'entry'/'exit'
                dicts with 'timestamp' (datetime or ISO-8601 string) and 'price'.
            market_snapshots: Dict of {timeframe: DataFrame} with OHLCV data;
                None or empty frames are skipped. Candle time is taken from a
                'timestamp' column or, if absent, from the index.
            model_predictions: List of model predictions at annotation time

        Returns:
            True if successful, False if anything failed (the error is logged
            with its traceback, not raised)
        """
        conn = None
        try:
            # Parse timestamps
            entry_time = annotation_data.get('entry', {}).get('timestamp')
            exit_time = annotation_data.get('exit', {}).get('timestamp')

            if isinstance(entry_time, str):
                entry_time = datetime.fromisoformat(entry_time.replace('Z', '+00:00'))
            if isinstance(exit_time, str):
                exit_time = datetime.fromisoformat(exit_time.replace('Z', '+00:00'))

            # Fail with a clear message instead of an AttributeError on None.
            if entry_time is None or exit_time is None:
                raise ValueError("annotation_data must contain entry and exit timestamps")

            snapshots = market_snapshots or {}

            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            # Store annotation metadata
            cursor.execute("""
                INSERT OR REPLACE INTO annotations
                (annotation_id, symbol, timeframe, direction,
                 entry_timestamp, entry_price, exit_timestamp, exit_price,
                 profit_loss_pct, notes, created_at, market_context, model_features, pivot_data)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                annotation_id,
                annotation_data.get('symbol'),
                annotation_data.get('timeframe'),
                annotation_data.get('direction'),
                int(entry_time.timestamp() * 1000),
                annotation_data.get('entry', {}).get('price'),
                int(exit_time.timestamp() * 1000),
                annotation_data.get('exit', {}).get('price'),
                annotation_data.get('profit_loss_pct'),
                annotation_data.get('notes', ''),
                int(datetime.now().timestamp() * 1000),
                json.dumps(annotation_data.get('entry_market_state', {})),
                json.dumps(annotation_data.get('model_features', {})),
                json.dumps(annotation_data.get('pivot_data', {}))
            ))

            # Store OHLCV snapshots for each timeframe
            epoch = pd.Timestamp("1970-01-01", tz="UTC")
            one_ms = pd.Timedelta(milliseconds=1)
            for timeframe, df in snapshots.items():
                if df is None or df.empty:
                    continue

                if 'timestamp' in df.columns:
                    timestamps = df['timestamp']
                else:
                    timestamps = df.index.to_series()

                # Convert datetimes to Unix milliseconds independent of the
                # column's resolution or timezone (naive values count as UTC).
                if pd.api.types.is_datetime64_any_dtype(timestamps):
                    timestamps = (pd.to_datetime(timestamps, utc=True) - epoch) // one_ms

                candle_rows = zip(
                    timestamps.astype('int64').tolist(),
                    df['open'].astype(float).tolist(),
                    df['high'].astype(float).tolist(),
                    df['low'].astype(float).tolist(),
                    df['close'].astype(float).tolist(),
                    df['volume'].astype(float).tolist(),
                )

                # One batched statement per timeframe; OR REPLACE makes
                # re-storing the same candles idempotent.
                cursor.executemany("""
                    INSERT OR REPLACE INTO annotation_ohlcv_snapshots
                    (annotation_id, timeframe, timestamp, open, high, low, close, volume)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, [(annotation_id, timeframe, *candle) for candle in candle_rows])

            # Store model predictions
            # NOTE(review): predictions are appended, so storing the same
            # annotation twice duplicates them - confirm whether replace
            # semantics are wanted here.
            if model_predictions:
                now_ms = datetime.now().timestamp() * 1000
                cursor.executemany("""
                    INSERT INTO annotation_model_predictions
                    (annotation_id, model_name, timestamp, prediction, confidence, features)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, [
                    (
                        annotation_id,
                        pred.get('model_name'),
                        int(pred.get('timestamp', now_ms)),
                        json.dumps(pred.get('prediction')),
                        pred.get('confidence'),
                        json.dumps(pred.get('features', {}))
                    )
                    for pred in model_predictions
                ])

            conn.commit()

            logger.info(f"Stored annotation {annotation_id} with {len(snapshots)} timeframes in SQLite")
            return True

        except Exception as e:
            # logger.exception records the traceback through the logging
            # system instead of printing it to stderr.
            logger.exception(f"Error storing annotation in SQLite: {e}")
            return False

        finally:
            # Closing without commit rolls back a partially written annotation.
            if conn is not None:
                conn.close()

    def get_annotation(self, annotation_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve complete annotation with all market data

        Args:
            annotation_id: Annotation ID

        Returns:
            Dict with every column of the annotations row (JSON columns
            decoded; NULL decodes to an empty dict) plus:
              - 'ohlcv_snapshots': {timeframe: DataFrame indexed by UTC
                timestamp, oldest first}
              - 'model_predictions': list of dicts with model_name,
                timestamp, prediction, confidence and features
            None if the annotation does not exist or an error occurred (the
            error is logged, not raised).
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            # Get annotation metadata
            cursor.execute("""
                SELECT * FROM annotations WHERE annotation_id = ?
            """, (annotation_id,))

            record = cursor.fetchone()
            if not record:
                return None

            # Parse annotation data
            columns = [desc[0] for desc in cursor.description]
            annotation = dict(zip(columns, record))

            # Parse JSON fields. These columns are nullable, so a NULL (or
            # empty) value is treated as "no data" instead of failing the
            # whole lookup.
            for field in ('market_context', 'model_features', 'pivot_data'):
                raw = annotation.get(field)
                annotation[field] = json.loads(raw) if raw else {}

            # Get OHLCV snapshots
            cursor.execute("""
                SELECT timeframe, timestamp, open, high, low, close, volume
                FROM annotation_ohlcv_snapshots
                WHERE annotation_id = ?
                ORDER BY timeframe, timestamp
            """, (annotation_id,))

            snapshots: Dict[str, List[Dict[str, Any]]] = {}
            for timeframe, ts, open_, high, low, close, volume in cursor.fetchall():
                snapshots.setdefault(timeframe, []).append({
                    'timestamp': ts,
                    'open': open_,
                    'high': high,
                    'low': low,
                    'close': close,
                    'volume': volume
                })

            # Convert to DataFrames
            annotation['ohlcv_snapshots'] = {}
            for timeframe, candles in snapshots.items():
                df = pd.DataFrame(candles)
                df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)
                annotation['ohlcv_snapshots'][timeframe] = df.set_index('timestamp')

            # Get model predictions
            cursor.execute("""
                SELECT model_name, timestamp, prediction, confidence, features
                FROM annotation_model_predictions
                WHERE annotation_id = ?
            """, (annotation_id,))

            annotation['model_predictions'] = [
                {
                    'model_name': model_name,
                    'timestamp': ts,
                    'prediction': json.loads(prediction),
                    'confidence': confidence,
                    # 'features' is a nullable column
                    'features': json.loads(features) if features else {}
                }
                for model_name, ts, prediction, confidence, features in cursor.fetchall()
            ]

            logger.info(f"Retrieved annotation {annotation_id} with {len(snapshots)} timeframes from SQLite")
            return annotation

        except Exception as e:
            logger.error(f"Error retrieving annotation from SQLite: {e}")
            return None

        finally:
            if conn is not None:
                conn.close()

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics

        Returns:
            Dict with:
              - 'ohlcv_stats': one entry per (symbol, timeframe) row of
                cache_metadata, with candle_count and first/last timestamp
                as naive datetimes in local time (datetime.fromtimestamp)
              - 'annotation_count': number of stored annotations
              - 'total_candles': number of rows in ohlcv_data
            An empty dict if an error occurred (the error is logged, not
            raised).
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            # Get OHLCV stats
            cursor.execute("""
                SELECT symbol, timeframe, candle_count, first_timestamp, last_timestamp
                FROM cache_metadata
                ORDER BY symbol, timeframe
            """)
            ohlcv_stats = [
                {
                    'symbol': symbol,
                    'timeframe': timeframe,
                    'candle_count': candle_count,
                    # Stored values are Unix milliseconds.
                    'first_timestamp': datetime.fromtimestamp(first_ms / 1000),
                    'last_timestamp': datetime.fromtimestamp(last_ms / 1000)
                }
                for symbol, timeframe, candle_count, first_ms, last_ms in cursor.fetchall()
            ]

            # Get annotation count
            cursor.execute("SELECT COUNT(*) FROM annotations")
            annotation_count = cursor.fetchone()[0]

            # Get total OHLCV rows
            cursor.execute("SELECT COUNT(*) FROM ohlcv_data")
            total_candles = cursor.fetchone()[0]

            return {
                'ohlcv_stats': ohlcv_stats,
                'annotation_count': annotation_count,
                'total_candles': total_candles
            }

        except Exception as e:
            logger.error(f"Error getting cache stats: {e}")
            return {}

        finally:
            if conn is not None:
                conn.close()