WIP: store OHLCV candles in DuckDB tables instead of Parquet files

2025-10-24 16:19:42 +03:00
parent 809c32e7a8
commit 4b8f44d859
2 changed files with 200 additions and 180 deletions
--- a/core/duckdb_storage.py
+++ b/core/duckdb_storage.py
@@ -30,8 +30,8 @@ class DuckDBStorage:
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        
-        # Parquet storage directory
-        self.parquet_dir = self.db_path.parent / "parquet_store"
+        # Parquet storage directory (only for annotation snapshots)
+        self.parquet_dir = self.db_path.parent / "annotation_snapshots"
        self.parquet_dir.mkdir(parents=True, exist_ok=True)
        
        # Connect to DuckDB
@@ -41,12 +41,42 @@ class DuckDBStorage:
        self._init_schema()
        
        logger.info(f"DuckDB storage initialized: {self.db_path}")
-        logger.info(f"Parquet storage: {self.parquet_dir}")
+        logger.info(f"Annotation snapshots: {self.parquet_dir}")
    
    def _init_schema(self):
-        """Initialize database schema with Parquet integration"""
+        """Initialize database schema - all data in DuckDB tables"""
        
-        # Create annotations table (metadata only)
+        # Create OHLCV data table - stores ALL candles
+        self.conn.execute("""
+            CREATE SEQUENCE IF NOT EXISTS ohlcv_id_seq START 1
+        """)
+        self.conn.execute("""
+            CREATE TABLE IF NOT EXISTS ohlcv_data (
+                id INTEGER PRIMARY KEY DEFAULT nextval('ohlcv_id_seq'),
+                symbol VARCHAR NOT NULL,
+                timeframe VARCHAR NOT NULL,
+                timestamp BIGINT NOT NULL,
+                open DOUBLE NOT NULL,
+                high DOUBLE NOT NULL,
+                low DOUBLE NOT NULL,
+                close DOUBLE NOT NULL,
+                volume DOUBLE NOT NULL,
+                created_at BIGINT NOT NULL,
+                UNIQUE(symbol, timeframe, timestamp)
+            )
+        """)
+        
+        # Create indexes for fast queries
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_timeframe 
+            ON ohlcv_data(symbol, timeframe)
+        """)
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_ohlcv_timestamp 
+            ON ohlcv_data(timestamp)
+        """)
+        
+        # Create annotations table
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS annotations (
                annotation_id VARCHAR PRIMARY KEY,
@@ -72,7 +102,7 @@ class DuckDBStorage:
            CREATE TABLE IF NOT EXISTS cache_metadata (
                symbol VARCHAR NOT NULL,
                timeframe VARCHAR NOT NULL,
-                parquet_path VARCHAR NOT NULL,
+                parquet_path VARCHAR,
                first_timestamp BIGINT NOT NULL,
                last_timestamp BIGINT NOT NULL,
                candle_count INTEGER NOT NULL,
@@ -81,11 +111,11 @@ class DuckDBStorage:
            )
        """)
        
-        logger.info("DuckDB schema initialized")
+        logger.info("DuckDB schema initialized (all data in tables)")
    
    def store_ohlcv_data(self, symbol: str, timeframe: str, df: pd.DataFrame) -> int:
        """
-        Store OHLCV data as Parquet file and register in DuckDB
+        Store OHLCV data directly in DuckDB table
        
        Args:
            symbol: Trading symbol
@@ -113,38 +143,41 @@ class DuckDBStorage:
            # Add metadata
            df_copy['symbol'] = symbol
            df_copy['timeframe'] = timeframe
+            df_copy['created_at'] = int(datetime.now().timestamp() * 1000)
            
-            # Define parquet file path
-            parquet_file = self.parquet_dir / f"{symbol.replace('/', '_')}_{timeframe}.parquet"
+            # Select columns in correct order
+            columns = ['symbol', 'timeframe', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'created_at']
+            df_insert = df_copy[columns]
            
-            # Load existing data if file exists
-            if parquet_file.exists():
-                try:
-                    existing_df = pd.read_parquet(parquet_file)
-                    # Combine with new data
-                    df_copy = pd.concat([existing_df, df_copy], ignore_index=True)
-                    # Remove duplicates
-                    df_copy = df_copy.drop_duplicates(subset=['timestamp'], keep='last')
-                    df_copy = df_copy.sort_values('timestamp')
-                except Exception as e:
-                    logger.warning(f"Could not load existing parquet: {e}")
+            # Insert data directly into DuckDB; rows already stored for the same (symbol, timeframe, timestamp) are skipped
+            # Note: id column is auto-generated, so we don't include it
+            self.conn.execute("""
+                INSERT INTO ohlcv_data (symbol, timeframe, timestamp, open, high, low, close, volume, created_at)
+                SELECT symbol, timeframe, timestamp, open, high, low, close, volume, created_at 
+                FROM df_insert
+                ON CONFLICT DO NOTHING
+            """)
            
-            # Save to parquet
-            df_copy.to_parquet(parquet_file, index=False, compression='snappy')
+            # Update metadata
+            result = self.conn.execute("""
+                SELECT 
+                    MIN(timestamp) as first_ts,
+                    MAX(timestamp) as last_ts,
+                    COUNT(*) as count
+                FROM ohlcv_data
+                WHERE symbol = ? AND timeframe = ?
+            """, (symbol, timeframe)).fetchone()
            
-            # Update metadata in DuckDB
-            first_ts = int(df_copy['timestamp'].min())
-            last_ts = int(df_copy['timestamp'].max())
-            count = len(df_copy)
+            first_ts, last_ts, count = result
            now_ts = int(datetime.now().timestamp() * 1000)
            
            self.conn.execute("""
                INSERT OR REPLACE INTO cache_metadata 
                (symbol, timeframe, parquet_path, first_timestamp, last_timestamp, candle_count, last_update)
                VALUES (?, ?, ?, ?, ?, ?, ?)
-            """, (symbol, timeframe, str(parquet_file), first_ts, last_ts, count, now_ts))
+            """, (symbol, timeframe, '', first_ts, last_ts, count, now_ts))
            
-            logger.info(f"Stored {len(df)} candles for {symbol} {timeframe} in Parquet (total: {count})")
+            logger.info(f"Stored {len(df)} candles for {symbol} {timeframe} in DuckDB (total: {count})")
            return len(df)
            
        except Exception as e:
@@ -158,7 +191,7 @@ class DuckDBStorage:
                       end_time: Optional[datetime] = None,
                       limit: Optional[int] = None) -> Optional[pd.DataFrame]:
        """
-        Query OHLCV data directly from Parquet using DuckDB
+        Query OHLCV data directly from DuckDB table
        
        Args:
            symbol: Trading symbol
@@ -171,26 +204,10 @@ class DuckDBStorage:
            DataFrame with OHLCV data
        """
        try:
-            # Get parquet file path from metadata
-            result = self.conn.execute("""
-                SELECT parquet_path FROM cache_metadata
-                WHERE symbol = ? AND timeframe = ?
-            """, (symbol, timeframe)).fetchone()
-            
-            if not result:
-                logger.debug(f"No data found for {symbol} {timeframe}")
-                return None
-            
-            parquet_path = result[0]
-            
-            if not Path(parquet_path).exists():
-                logger.warning(f"Parquet file not found: {parquet_path}")
-                return None
-            
-            # Build query - DuckDB can query Parquet directly!
-            query = f"""
+            # Build query
+            query = """
                SELECT timestamp, open, high, low, close, volume
-                FROM read_parquet('{parquet_path}')
+                FROM ohlcv_data
                WHERE symbol = ? AND timeframe = ?
            """
            params = [symbol, timeframe]
@@ -219,7 +236,7 @@ class DuckDBStorage:
            df = df.set_index('timestamp')
            df = df.sort_index()
            
-            logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from Parquet")
+            logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from DuckDB")
            return df
            
        except Exception as e: