WIP: OHLCV in storage. Migrate to DuckDB

2025-10-24 15:57:33 +03:00
parent d4ed894a92
commit 809c32e7a8
7 changed files with 1417 additions and 92 deletions
--- a/core/sqlite_storage.py
+++ b/core/sqlite_storage.py
@@ -0,0 +1,526 @@
+"""
+SQLite Storage for Long-Term OHLCV Data and Annotation Replay
+
+This module provides persistent storage for:
+1. OHLCV data for all timeframes (unlimited history)
+2. Complete annotation data with market context
+3. Model predictions and features at annotation time
+4. Efficient querying for historical replay
+
+Parquet files are used for recent data (1500 candles) for speed.
+SQLite is used for long-term storage and annotation replay.
+"""
+
+import sqlite3
+import logging
+import pandas as pd
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Any
+import json
+import pickle
+
+logger = logging.getLogger(__name__)
+
+
+class SQLiteStorage:
+    """SQLite storage for OHLCV data and annotations"""
+    
+    def __init__(self, db_path: str = "cache/trading_data.db"):
+        """Set up the storage backend.
+
+        Makes sure the directory that will hold the database exists and
+        creates the schema (tables and indexes) if it is not there yet.
+
+        Args:
+            db_path: Location of the SQLite database file.
+        """
+        database_file = Path(db_path)
+        database_file.parent.mkdir(parents=True, exist_ok=True)
+        self.db_path = database_file
+
+        # Create tables/indexes on first use; a no-op on later runs.
+        self._init_schema()
+
+        logger.info(f"SQLite storage initialized: {self.db_path}")
+    
+    def _init_schema(self):
+        """Create the tables and indexes used by this storage, if missing.
+
+        Every statement uses ``IF NOT EXISTS``, so running this against an
+        already-initialized database changes nothing. The connection is
+        closed even when a statement fails.
+
+        Raises:
+            sqlite3.Error: If the database cannot be opened or a statement
+                fails.
+        """
+        schema_statements = (
+            # OHLCV data table - stores all historical candles
+            """
+            CREATE TABLE IF NOT EXISTS ohlcv_data (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                symbol TEXT NOT NULL,
+                timeframe TEXT NOT NULL,
+                timestamp INTEGER NOT NULL,
+                open REAL NOT NULL,
+                high REAL NOT NULL,
+                low REAL NOT NULL,
+                close REAL NOT NULL,
+                volume REAL NOT NULL,
+                created_at INTEGER NOT NULL,
+                UNIQUE(symbol, timeframe, timestamp)
+            )
+            """,
+            # Indexes for fast queries.
+            # NOTE(review): the UNIQUE constraint above is already backed by
+            # an automatic index on (symbol, timeframe, timestamp), so
+            # idx_ohlcv_symbol_timeframe and idx_ohlcv_lookup look redundant.
+            # They are kept so existing databases and new ones stay identical.
+            """
+            CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_timeframe
+            ON ohlcv_data(symbol, timeframe)
+            """,
+            """
+            CREATE INDEX IF NOT EXISTS idx_ohlcv_timestamp
+            ON ohlcv_data(timestamp)
+            """,
+            """
+            CREATE INDEX IF NOT EXISTS idx_ohlcv_lookup
+            ON ohlcv_data(symbol, timeframe, timestamp)
+            """,
+            # Annotations table - stores complete annotation data
+            """
+            CREATE TABLE IF NOT EXISTS annotations (
+                annotation_id TEXT PRIMARY KEY,
+                symbol TEXT NOT NULL,
+                timeframe TEXT NOT NULL,
+                direction TEXT NOT NULL,
+                entry_timestamp INTEGER NOT NULL,
+                entry_price REAL NOT NULL,
+                exit_timestamp INTEGER NOT NULL,
+                exit_price REAL NOT NULL,
+                profit_loss_pct REAL NOT NULL,
+                notes TEXT,
+                created_at INTEGER NOT NULL,
+                market_context TEXT,
+                model_features TEXT,
+                pivot_data TEXT
+            )
+            """,
+            # Annotation OHLCV snapshots - stores market data at annotation time
+            """
+            CREATE TABLE IF NOT EXISTS annotation_ohlcv_snapshots (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                annotation_id TEXT NOT NULL,
+                timeframe TEXT NOT NULL,
+                timestamp INTEGER NOT NULL,
+                open REAL NOT NULL,
+                high REAL NOT NULL,
+                low REAL NOT NULL,
+                close REAL NOT NULL,
+                volume REAL NOT NULL,
+                FOREIGN KEY (annotation_id) REFERENCES annotations(annotation_id),
+                UNIQUE(annotation_id, timeframe, timestamp)
+            )
+            """,
+            """
+            CREATE INDEX IF NOT EXISTS idx_annotation_snapshots
+            ON annotation_ohlcv_snapshots(annotation_id, timeframe)
+            """,
+            # Model predictions table - stores model outputs at annotation time
+            """
+            CREATE TABLE IF NOT EXISTS annotation_model_predictions (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                annotation_id TEXT NOT NULL,
+                model_name TEXT NOT NULL,
+                timestamp INTEGER NOT NULL,
+                prediction TEXT NOT NULL,
+                confidence REAL,
+                features TEXT,
+                FOREIGN KEY (annotation_id) REFERENCES annotations(annotation_id)
+            )
+            """,
+            # Cache metadata table - tracks what data we have
+            """
+            CREATE TABLE IF NOT EXISTS cache_metadata (
+                symbol TEXT NOT NULL,
+                timeframe TEXT NOT NULL,
+                first_timestamp INTEGER NOT NULL,
+                last_timestamp INTEGER NOT NULL,
+                candle_count INTEGER NOT NULL,
+                last_update INTEGER NOT NULL,
+                PRIMARY KEY (symbol, timeframe)
+            )
+            """,
+        )
+
+        conn = sqlite3.connect(self.db_path)
+        try:
+            cursor = conn.cursor()
+            for statement in schema_statements:
+                cursor.execute(statement)
+            conn.commit()
+        finally:
+            # Release the file handle even if a statement raised.
+            conn.close()
+
+        logger.info("SQLite schema initialized")
+    
+    def store_ohlcv_data(self, symbol: str, timeframe: str, df: pd.DataFrame) -> int:
+        """
+        Store OHLCV data in SQLite
+
+        Candles whose (symbol, timeframe, timestamp) is already stored are
+        skipped, so overlapping batches can be written repeatedly without
+        failing the whole batch. The cache_metadata row for the pair is
+        refreshed afterwards.
+
+        Args:
+            symbol: Trading symbol
+            timeframe: Timeframe
+            df: DataFrame with OHLCV data (timestamp as index or column);
+                must provide open, high, low, close and volume columns
+
+        Returns:
+            Number of new rows actually inserted; 0 when the input is empty,
+            every candle was already stored, or an error occurred (errors
+            are logged, not raised)
+        """
+        if df is None or df.empty:
+            return 0
+
+        conn = None
+        try:
+            # Work on a copy so the caller's frame is never modified
+            df_copy = df.copy()
+
+            # Ensure timestamp column exists
+            if 'timestamp' not in df_copy.columns:
+                df_copy['timestamp'] = df_copy.index
+
+            # Convert timestamp to Unix milliseconds. Dividing the offset from
+            # the epoch by 1 ms is independent of the column's datetime
+            # resolution (ns/us/ms/s) and works for tz-aware and naive data.
+            if pd.api.types.is_datetime64_any_dtype(df_copy['timestamp']):
+                epoch = pd.Timestamp(0, tz=df_copy['timestamp'].dt.tz)
+                df_copy['timestamp'] = (df_copy['timestamp'] - epoch) // pd.Timedelta(milliseconds=1)
+
+            now_ms = int(datetime.now().timestamp() * 1000)
+
+            # Plain Python scalars only: sqlite3 does not bind numpy integer
+            # types as INTEGER.
+            rows = [
+                (symbol, timeframe, int(ts), float(o), float(h), float(lo), float(c), float(v), now_ms)
+                for ts, o, h, lo, c, v in zip(
+                    df_copy['timestamp'], df_copy['open'], df_copy['high'],
+                    df_copy['low'], df_copy['close'], df_copy['volume'],
+                )
+            ]
+            first_ts = int(df_copy['timestamp'].min())
+            last_ts = int(df_copy['timestamp'].max())
+
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+
+            # Insert data (ignore duplicates): OR IGNORE skips rows that would
+            # violate UNIQUE(symbol, timeframe, timestamp).
+            changes_before = conn.total_changes
+            cursor.executemany("""
+                INSERT OR IGNORE INTO ohlcv_data
+                (symbol, timeframe, timestamp, open, high, low, close, volume, created_at)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+            """, rows)
+            rows_inserted = conn.total_changes - changes_before
+
+            # Update metadata
+            cursor.execute("""
+                INSERT OR REPLACE INTO cache_metadata 
+                (symbol, timeframe, first_timestamp, last_timestamp, candle_count, last_update)
+                VALUES (?, ?, 
+                    COALESCE((SELECT MIN(timestamp) FROM ohlcv_data WHERE symbol=? AND timeframe=?), ?),
+                    COALESCE((SELECT MAX(timestamp) FROM ohlcv_data WHERE symbol=? AND timeframe=?), ?),
+                    (SELECT COUNT(*) FROM ohlcv_data WHERE symbol=? AND timeframe=?),
+                    ?)
+            """, (
+                symbol, timeframe,
+                symbol, timeframe, first_ts,
+                symbol, timeframe, last_ts,
+                symbol, timeframe,
+                now_ms
+            ))
+
+            conn.commit()
+
+            logger.info(f"Stored {rows_inserted} candles for {symbol} {timeframe} in SQLite")
+            return rows_inserted
+
+        except Exception as e:
+            logger.error(f"Error storing OHLCV data in SQLite: {e}")
+            return 0
+        finally:
+            # Always release the connection; uncommitted work is rolled back.
+            if conn is not None:
+                conn.close()
+    
+    def get_ohlcv_data(self, symbol: str, timeframe: str, 
+                       start_time: Optional[datetime] = None,
+                       end_time: Optional[datetime] = None,
+                       limit: Optional[int] = None) -> Optional[pd.DataFrame]:
+        """
+        Retrieve OHLCV data from SQLite
+
+        Rows are selected newest-first so that ``limit`` keeps the most
+        recent candles inside the requested window; the returned frame is
+        then sorted oldest-first.
+
+        Args:
+            symbol: Trading symbol
+            timeframe: Timeframe
+            start_time: Start time filter (inclusive)
+            end_time: End time filter (inclusive)
+            limit: Maximum number of candles; None or 0 means no limit
+
+        Returns:
+            DataFrame with open/high/low/close/volume columns indexed by a
+            UTC ``timestamp``, or None when nothing matches or an error
+            occurred (errors are logged, not raised)
+        """
+        conn = None
+        try:
+            # Build query
+            query = """
+                SELECT timestamp, open, high, low, close, volume
+                FROM ohlcv_data
+                WHERE symbol = ? AND timeframe = ?
+            """
+            params: List[Any] = [symbol, timeframe]
+
+            if start_time:
+                query += " AND timestamp >= ?"
+                params.append(int(start_time.timestamp() * 1000))
+
+            if end_time:
+                query += " AND timestamp <= ?"
+                params.append(int(end_time.timestamp() * 1000))
+
+            query += " ORDER BY timestamp DESC"
+
+            if limit:
+                # Bound as a parameter rather than formatted into the SQL text
+                query += " LIMIT ?"
+                params.append(int(limit))
+
+            # Execute query
+            conn = sqlite3.connect(self.db_path)
+            df = pd.read_sql_query(query, conn, params=params)
+
+            if df.empty:
+                return None
+
+            # Convert timestamp to datetime
+            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)
+            df = df.set_index('timestamp')
+            df = df.sort_index()
+
+            logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from SQLite")
+            return df
+
+        except Exception as e:
+            logger.error(f"Error retrieving OHLCV data from SQLite: {e}")
+            return None
+        finally:
+            # Close the connection on every path, including errors
+            if conn is not None:
+                conn.close()
+    
+    def store_annotation(self, annotation_id: str, annotation_data: Dict[str, Any],
+                        market_snapshots: Dict[str, pd.DataFrame],
+                        model_predictions: Optional[List[Dict]] = None) -> bool:
+        """
+        Store complete annotation with market context and model data
+
+        The annotation row and its OHLCV snapshot rows are upserted. When
+        ``model_predictions`` is supplied, any predictions previously stored
+        for this annotation are replaced, so storing the same annotation
+        again does not duplicate them. Everything is written in one
+        transaction; on failure nothing is committed.
+
+        Args:
+            annotation_id: Unique annotation ID
+            annotation_data: Annotation metadata (entry, exit, symbol, etc.);
+                ``entry`` and ``exit`` are dicts with ``timestamp`` (datetime
+                or ISO-8601 string) and ``price``
+            market_snapshots: Dict of {timeframe: DataFrame} with OHLCV data
+            model_predictions: List of model predictions at annotation time
+
+        Returns:
+            True if successful, False otherwise (errors are logged, not raised)
+        """
+        conn = None
+        try:
+            # Parse timestamps
+            entry_time = annotation_data.get('entry', {}).get('timestamp')
+            exit_time = annotation_data.get('exit', {}).get('timestamp')
+
+            if entry_time is None or exit_time is None:
+                raise ValueError("annotation_data must contain entry and exit timestamps")
+
+            if isinstance(entry_time, str):
+                entry_time = datetime.fromisoformat(entry_time.replace('Z', '+00:00'))
+            if isinstance(exit_time, str):
+                exit_time = datetime.fromisoformat(exit_time.replace('Z', '+00:00'))
+
+            now_ms = int(datetime.now().timestamp() * 1000)
+
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+
+            # Store annotation metadata
+            cursor.execute("""
+                INSERT OR REPLACE INTO annotations
+                (annotation_id, symbol, timeframe, direction, 
+                 entry_timestamp, entry_price, exit_timestamp, exit_price,
+                 profit_loss_pct, notes, created_at, market_context, model_features, pivot_data)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            """, (
+                annotation_id,
+                annotation_data.get('symbol'),
+                annotation_data.get('timeframe'),
+                annotation_data.get('direction'),
+                int(entry_time.timestamp() * 1000),
+                annotation_data.get('entry', {}).get('price'),
+                int(exit_time.timestamp() * 1000),
+                annotation_data.get('exit', {}).get('price'),
+                annotation_data.get('profit_loss_pct'),
+                annotation_data.get('notes', ''),
+                now_ms,
+                json.dumps(annotation_data.get('entry_market_state', {})),
+                json.dumps(annotation_data.get('model_features', {})),
+                json.dumps(annotation_data.get('pivot_data', {}))
+            ))
+
+            # Store OHLCV snapshots for each timeframe
+            for timeframe, df in market_snapshots.items():
+                if df is None or df.empty:
+                    continue
+
+                # Copy so the caller's frame is never modified
+                df_copy = df.copy()
+
+                # Ensure timestamp column
+                if 'timestamp' not in df_copy.columns:
+                    df_copy['timestamp'] = df_copy.index
+
+                # Convert timestamp to Unix milliseconds, independent of the
+                # column's datetime resolution and timezone awareness
+                if pd.api.types.is_datetime64_any_dtype(df_copy['timestamp']):
+                    epoch = pd.Timestamp(0, tz=df_copy['timestamp'].dt.tz)
+                    df_copy['timestamp'] = (df_copy['timestamp'] - epoch) // pd.Timedelta(milliseconds=1)
+
+                # Insert all candles of this timeframe in one batch
+                candle_rows = [
+                    (annotation_id, timeframe, int(ts), float(o), float(h), float(lo), float(c), float(v))
+                    for ts, o, h, lo, c, v in zip(
+                        df_copy['timestamp'], df_copy['open'], df_copy['high'],
+                        df_copy['low'], df_copy['close'], df_copy['volume'],
+                    )
+                ]
+                cursor.executemany("""
+                    INSERT OR REPLACE INTO annotation_ohlcv_snapshots
+                    (annotation_id, timeframe, timestamp, open, high, low, close, volume)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                """, candle_rows)
+
+            # Store model predictions
+            if model_predictions:
+                # The predictions table has no unique key, so clear earlier
+                # rows for this annotation to keep re-stores idempotent.
+                cursor.execute(
+                    "DELETE FROM annotation_model_predictions WHERE annotation_id = ?",
+                    (annotation_id,)
+                )
+                prediction_rows = []
+                for pred in model_predictions:
+                    pred_ts = pred.get('timestamp')
+                    if pred_ts is None:
+                        pred_ts = now_ms
+                    prediction_rows.append((
+                        annotation_id,
+                        pred.get('model_name'),
+                        int(pred_ts),
+                        json.dumps(pred.get('prediction')),
+                        pred.get('confidence'),
+                        json.dumps(pred.get('features', {}))
+                    ))
+                cursor.executemany("""
+                    INSERT INTO annotation_model_predictions
+                    (annotation_id, model_name, timestamp, prediction, confidence, features)
+                    VALUES (?, ?, ?, ?, ?, ?)
+                """, prediction_rows)
+
+            conn.commit()
+
+            logger.info(f"Stored annotation {annotation_id} with {len(market_snapshots)} timeframes in SQLite")
+            return True
+
+        except Exception as e:
+            # logger.exception records the traceback through the logging system
+            logger.exception(f"Error storing annotation in SQLite: {e}")
+            return False
+        finally:
+            # Closing without commit discards any partial writes
+            if conn is not None:
+                conn.close()
+    
+    def get_annotation(self, annotation_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Retrieve complete annotation with all market data
+
+        Args:
+            annotation_id: Annotation ID
+
+        Returns:
+            Dict with the annotation columns (JSON columns decoded, NULL JSON
+            columns returned as empty dicts), plus ``ohlcv_snapshots``
+            ({timeframe: DataFrame indexed by UTC timestamp}) and
+            ``model_predictions`` (list of dicts). None when the annotation
+            does not exist or an error occurred (errors are logged, not raised)
+        """
+        conn = None
+        try:
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+
+            # Get annotation metadata
+            cursor.execute("""
+                SELECT * FROM annotations WHERE annotation_id = ?
+            """, (annotation_id,))
+
+            row = cursor.fetchone()
+            if not row:
+                return None
+
+            # Parse annotation data
+            columns = [desc[0] for desc in cursor.description]
+            annotation = dict(zip(columns, row))
+
+            # Parse JSON fields. The keys always exist (they are table
+            # columns), so a dict.get default would never apply; fall back
+            # explicitly when the stored value is NULL or empty.
+            for json_field in ('market_context', 'model_features', 'pivot_data'):
+                annotation[json_field] = json.loads(annotation.get(json_field) or '{}')
+
+            # Get OHLCV snapshots
+            cursor.execute("""
+                SELECT timeframe, timestamp, open, high, low, close, volume
+                FROM annotation_ohlcv_snapshots
+                WHERE annotation_id = ?
+                ORDER BY timeframe, timestamp
+            """, (annotation_id,))
+
+            snapshots: Dict[str, List[Dict[str, Any]]] = {}
+            for timeframe, ts, open_, high, low, close, volume in cursor.fetchall():
+                snapshots.setdefault(timeframe, []).append({
+                    'timestamp': ts,
+                    'open': open_,
+                    'high': high,
+                    'low': low,
+                    'close': close,
+                    'volume': volume
+                })
+
+            # Convert to DataFrames
+            annotation['ohlcv_snapshots'] = {}
+            for timeframe, candles in snapshots.items():
+                frame = pd.DataFrame(candles)
+                frame['timestamp'] = pd.to_datetime(frame['timestamp'], unit='ms', utc=True)
+                annotation['ohlcv_snapshots'][timeframe] = frame.set_index('timestamp')
+
+            # Get model predictions
+            cursor.execute("""
+                SELECT model_name, timestamp, prediction, confidence, features
+                FROM annotation_model_predictions
+                WHERE annotation_id = ?
+            """, (annotation_id,))
+
+            annotation['model_predictions'] = [
+                {
+                    'model_name': model_name,
+                    'timestamp': ts,
+                    'prediction': json.loads(prediction),
+                    'confidence': confidence,
+                    # features is a nullable column
+                    'features': json.loads(features or '{}')
+                }
+                for model_name, ts, prediction, confidence, features in cursor.fetchall()
+            ]
+
+            logger.info(f"Retrieved annotation {annotation_id} with {len(snapshots)} timeframes from SQLite")
+            return annotation
+
+        except Exception as e:
+            logger.error(f"Error retrieving annotation from SQLite: {e}")
+            return None
+        finally:
+            # Close the connection on every path, including errors
+            if conn is not None:
+                conn.close()
+    
+    def get_cache_stats(self) -> Dict[str, Any]:
+        """Get cache statistics
+
+        Returns:
+            Dict with ``ohlcv_stats`` (one entry per symbol/timeframe from
+            cache_metadata, ordered by symbol then timeframe),
+            ``annotation_count`` and ``total_candles``. An empty dict when an
+            error occurred (errors are logged, not raised)
+        """
+        conn = None
+        try:
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+
+            # Get OHLCV stats
+            cursor.execute("""
+                SELECT symbol, timeframe, candle_count, first_timestamp, last_timestamp
+                FROM cache_metadata
+                ORDER BY symbol, timeframe
+            """)
+
+            # NOTE(review): fromtimestamp() without tz yields naive local-time
+            # datetimes, whereas get_ohlcv_data returns UTC-aware timestamps.
+            # Left as-is so existing callers see the same values.
+            ohlcv_stats = [
+                {
+                    'symbol': symbol,
+                    'timeframe': timeframe,
+                    'candle_count': candle_count,
+                    'first_timestamp': datetime.fromtimestamp(first_ms / 1000),
+                    'last_timestamp': datetime.fromtimestamp(last_ms / 1000)
+                }
+                for symbol, timeframe, candle_count, first_ms, last_ms in cursor.fetchall()
+            ]
+
+            # Get annotation count
+            cursor.execute("SELECT COUNT(*) FROM annotations")
+            annotation_count = cursor.fetchone()[0]
+
+            # Get total OHLCV rows
+            cursor.execute("SELECT COUNT(*) FROM ohlcv_data")
+            total_candles = cursor.fetchone()[0]
+
+            return {
+                'ohlcv_stats': ohlcv_stats,
+                'annotation_count': annotation_count,
+                'total_candles': total_candles
+            }
+
+        except Exception as e:
+            logger.error(f"Error getting cache stats: {e}")
+            return {}
+        finally:
+            # Close the connection on every path, including errors
+            if conn is not None:
+                conn.close()