WIP OHLCV in storage. migrate to DuckDB

2025-10-24 15:57:33 +03:00
parent d4ed894a92
commit 809c32e7a8
7 changed files with 1417 additions and 92 deletions
--- a/core/data_provider.py
+++ b/core/data_provider.py
@@ -67,6 +67,14 @@ except ImportError:
    UNIFIED_STORAGE_AVAILABLE = False
    logger.warning("Unified storage components not available")

+# Import DuckDB storage
+try:
+    from .duckdb_storage import DuckDBStorage
+    DUCKDB_STORAGE_AVAILABLE = True
+except ImportError:
+    DUCKDB_STORAGE_AVAILABLE = False
+    logger.warning("DuckDB storage not available")
+
@dataclass
 class PivotBounds:
    """Pivot-based normalization bounds derived from Williams Market Structure"""
@@ -142,15 +150,10 @@ class DataProvider:
    def __init__(self, symbols: List[str] = None, timeframes: List[str] = None):
        """Initialize the data provider"""
        self.config = get_config()
-        # Fixed symbols and timeframes for caching
+        # Fixed symbols and timeframes
        self.symbols = ['ETH/USDT', 'BTC/USDT']
        self.timeframes = ['1s', '1m', '1h', '1d']
        
-        # Cache settings (initialize first)
-        self.cache_enabled = True
-        self.cache_dir = Path('cache')
-        self.cache_dir.mkdir(parents=True, exist_ok=True)
-        
        # Data storage - cached OHLCV data (1500 candles each)
        self.cached_data = {}  # {symbol: {timeframe: DataFrame}}
        self.real_time_data = {}   # {symbol: {timeframe: deque}}
@@ -176,11 +179,7 @@ class DataProvider:
        
        # Pivot-based normalization system
        self.pivot_bounds: Dict[str, PivotBounds] = {}  # {symbol: PivotBounds}
-        self.pivot_cache_dir = self.cache_dir / 'pivot_bounds'
-        self.pivot_cache_dir.mkdir(parents=True, exist_ok=True)
        self.pivot_refresh_interval = timedelta(days=1)  # Refresh pivot bounds daily
-        self.monthly_data_cache_dir = self.cache_dir / 'monthly_1s_data'
-        self.monthly_data_cache_dir.mkdir(parents=True, exist_ok=True)
        
        # Enhanced WebSocket integration
        self.enhanced_cob_websocket: Optional[EnhancedCOBWebSocket] = None
@@ -266,11 +265,16 @@ class DataProvider:
        self.unified_storage: Optional['UnifiedDataProviderExtension'] = None
        self._unified_storage_enabled = False
        
-        # Auto-fix corrupted cache files on startup
-        self._auto_fix_corrupted_cache()
+        # DuckDB storage - unified storage with native Parquet support
+        self.duckdb_storage: Optional[DuckDBStorage] = None
+        if DUCKDB_STORAGE_AVAILABLE:
+            try:
+                self.duckdb_storage = DuckDBStorage()
+                logger.info("✅ DuckDB storage initialized (unified Parquet + SQL)")
+            except Exception as e:
+                logger.warning(f"Could not initialize DuckDB storage: {e}")
        
-        # Load existing pivot bounds from cache
-        self._load_all_pivot_bounds()
+        # Pivot bounds will be calculated on demand
        
        # COB (Consolidated Order Book) data system using WebSocket
        self.cob_integration: Optional[COBIntegration] = None
@@ -1488,11 +1492,18 @@ class DataProvider:
            logger.error(f"Error getting market state at time: {e}")
            return {}

-    def get_historical_data(self, symbol: str, timeframe: str, limit: int = 1000, refresh: bool = False) -> Optional[pd.DataFrame]:
+    def get_historical_data(self, symbol: str, timeframe: str, limit: int = 1000, refresh: bool = False, allow_stale_cache: bool = False) -> Optional[pd.DataFrame]:
        """Get historical OHLCV data.
        - Prefer cached data for low latency.
        - If cache is empty or refresh=True, fetch real data from exchanges.
        - Never generate synthetic data.
+        
+        Args:
+            symbol: Trading symbol
+            timeframe: Timeframe
+            limit: Number of candles to return
+            refresh: Force refresh from exchange
+            allow_stale_cache: Allow loading stale cache (for startup performance)
        """
        try:
            # Serve from cache when available
@@ -1501,6 +1512,17 @@ class DataProvider:
                if not cached_df.empty and not refresh:
                    return cached_df.tail(limit)

+            # Try loading from DuckDB first (fast Parquet queries)
+            if allow_stale_cache:
+                cached_df = self._load_from_duckdb(symbol, timeframe, limit=1500)
+                if cached_df is not None and not cached_df.empty:
+                    logger.info(f"Loaded {len(cached_df)} candles from DuckDB for {symbol} {timeframe} (startup mode)")
+                    # Store in memory cache
+                    if symbol not in self.cached_data:
+                        self.cached_data[symbol] = {}
+                    self.cached_data[symbol][timeframe] = cached_df.tail(1500)
+                    return cached_df.tail(limit)
+
            # Cache empty or refresh requested: fetch real data now
            df = self._fetch_from_binance(symbol, timeframe, limit)
            if (df is None or df.empty):
@@ -1508,7 +1530,15 @@ class DataProvider:

            if df is not None and not df.empty:
                df = self._ensure_datetime_index(df)
-                # Store/merge into cache
+                
+                # Store in DuckDB (Parquet + SQL in one)
+                if self.duckdb_storage:
+                    try:
+                        self.duckdb_storage.store_ohlcv_data(symbol, timeframe, df)
+                    except Exception as e:
+                        logger.warning(f"Could not store data in DuckDB: {e}")
+                
+                # Store/merge into memory cache (keep last 1500 candles for fast access)
                if symbol not in self.cached_data:
                    self.cached_data[symbol] = {}
                if timeframe not in self.cached_data[symbol] or self.cached_data[symbol][timeframe].empty:
@@ -1518,7 +1548,8 @@ class DataProvider:
                    combined_df = combined_df[~combined_df.index.duplicated(keep='last')]
                    combined_df = combined_df.sort_index()
                    self.cached_data[symbol][timeframe] = combined_df.tail(1500)
-                logger.info(f"Cached {len(self.cached_data[symbol][timeframe])} candles for {symbol} {timeframe}")
+                
+                logger.info(f"Stored {len(df)} candles for {symbol} {timeframe} (DuckDB + memory cache)")
                return self.cached_data[symbol][timeframe].tail(limit)

            logger.warning(f"No real data available for {symbol} {timeframe} at request time")
@@ -2973,71 +3004,33 @@ class DataProvider:
            logger.debug(f"Error calculating RSI: {e}")
            return 50.0  # Default neutral value
    
-    def _load_from_cache(self, symbol: str, timeframe: str) -> Optional[pd.DataFrame]:
-        """Load data from cache"""
-        try:
-            cache_file = self.cache_dir / f"{symbol.replace('/', '')}_{timeframe}.parquet"
-            if cache_file.exists():
-                # Check if cache is recent - stricter rules for startup
-                cache_age = time.time() - cache_file.stat().st_mtime
-                
-                # For 1m data, use cache only if less than 5 minutes old to avoid gaps
-                if timeframe == '1m':
-                    max_age = 300  # 5 minutes
-                else:
-                    max_age = 3600  # 1 hour for other timeframes
-                
-                if cache_age < max_age:
-                    try:
-                        df = pd.read_parquet(cache_file)
-                        # Ensure cached data has proper timezone (UTC to match COB WebSocket data)
-                        if not df.empty and 'timestamp' in df.columns:
-                            if df['timestamp'].dt.tz is None:
-                                # If no timezone info, assume UTC and keep in UTC
-                                df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
-                            elif str(df['timestamp'].dt.tz) != 'UTC':
-                                # Convert to UTC if different timezone
-                                df['timestamp'] = df['timestamp'].dt.tz_convert('UTC')
-                        logger.debug(f"Loaded {len(df)} rows from cache for {symbol} {timeframe} (age: {cache_age/60:.1f}min)")
-                        return df
-                    except Exception as parquet_e:
-                        # Handle corrupted Parquet file - expanded error detection
-                        error_str = str(parquet_e).lower()
-                        corrupted_indicators = [
-                            "parquet magic bytes not found",
-                            "corrupted",
-                            "couldn't deserialize thrift",
-                            "don't know what type",
-                            "invalid parquet file",
-                            "unexpected end of file",
-                            "invalid metadata"
-                        ]
-                        
-                        if any(indicator in error_str for indicator in corrupted_indicators):
-                            logger.warning(f"Corrupted Parquet cache file for {symbol} {timeframe}, removing and returning None: {parquet_e}")
-                            try:
-                                cache_file.unlink()  # Delete corrupted file
-                                logger.info(f"Deleted corrupted cache file: {cache_file}")
-                            except Exception as delete_e:
-                                logger.error(f"Failed to delete corrupted cache file: {delete_e}")
-                            return None
-                        else:
-                            raise parquet_e
-                else:
-                    logger.debug(f"Cache for {symbol} {timeframe} is too old ({cache_age/60:.1f}min > {max_age/60:.1f}min)")
+    def _load_from_duckdb(self, symbol: str, timeframe: str, limit: int = 1500) -> Optional[pd.DataFrame]:
+        """Load data from DuckDB storage
+        
+        Args:
+            symbol: Trading symbol
+            timeframe: Timeframe
+            limit: Number of candles to load
+        """
+        if not self.duckdb_storage:
            return None
-        except Exception as e:
-            logger.warning(f"Error loading cache for {symbol} {timeframe}: {e}")
-            return None
-    
-    def _save_to_cache(self, df: pd.DataFrame, symbol: str, timeframe: str):
-        """Save data to cache"""
+        
        try:
-            cache_file = self.cache_dir / f"{symbol.replace('/', '')}_{timeframe}.parquet"
-            df.to_parquet(cache_file, index=False)
-            logger.debug(f"Saved {len(df)} rows to cache for {symbol} {timeframe}")
+            df = self.duckdb_storage.get_ohlcv_data(
+                symbol=symbol,
+                timeframe=timeframe,
+                limit=limit
+            )
+            
+            if df is not None and not df.empty:
+                logger.debug(f"Loaded {len(df)} candles from DuckDB for {symbol} {timeframe}")
+                return df
+            
+            return None
+            
        except Exception as e:
-            logger.warning(f"Error saving cache for {symbol} {timeframe}: {e}")
+            logger.warning(f"Error loading from DuckDB for {symbol} {timeframe}: {e}")
+            return None
    
    async def start_real_time_streaming(self):
        """Start real-time data streaming using COBIntegration"""