WIP OHLCV in storage. Migrate to DuckDB
@@ -67,6 +67,14 @@ except ImportError:
    UNIFIED_STORAGE_AVAILABLE = False
    logger.warning("Unified storage components not available")

# Import DuckDB storage
try:
    from .duckdb_storage import DuckDBStorage
    DUCKDB_STORAGE_AVAILABLE = True
except ImportError:
    DUCKDB_STORAGE_AVAILABLE = False
    logger.warning("DuckDB storage not available")

@dataclass
class PivotBounds:
    """Pivot-based normalization bounds derived from Williams Market Structure"""
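The rest of this diff relies on just two methods of the new DuckDBStorage class: store_ohlcv_data(symbol, timeframe, df) and get_ohlcv_data(symbol=..., timeframe=..., limit=...). The duckdb_storage module itself is not part of this commit, so the following is only a minimal sketch of what it could look like; the table name, schema, and database path are assumptions, not taken from the diff.

# duckdb_storage.py - illustrative sketch only; the real module is not shown in this commit
from pathlib import Path
from typing import Optional

import duckdb
import pandas as pd

class DuckDBStorage:
    """Assumed interface: one DuckDB file holding OHLCV candles for all symbols/timeframes."""

    def __init__(self, db_path: str = "cache/market_data.duckdb"):  # path is an assumption
        Path(db_path).parent.mkdir(parents=True, exist_ok=True)
        self.conn = duckdb.connect(db_path)
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS ohlcv (
                symbol VARCHAR, timeframe VARCHAR, timestamp TIMESTAMP,
                open DOUBLE, high DOUBLE, low DOUBLE, close DOUBLE, volume DOUBLE,
                PRIMARY KEY (symbol, timeframe, timestamp)
            )
        """)

    def store_ohlcv_data(self, symbol: str, timeframe: str, df: pd.DataFrame) -> None:
        """Upsert a DataFrame of candles (index assumed to be the timestamp)."""
        candles = df.reset_index().rename(columns={"index": "timestamp"})
        self.conn.register("candles_view", candles)
        self.conn.execute(f"""
            INSERT OR REPLACE INTO ohlcv
            SELECT '{symbol}', '{timeframe}', timestamp, open, high, low, close, volume
            FROM candles_view
        """)
        self.conn.unregister("candles_view")

    def get_ohlcv_data(self, symbol: str, timeframe: str, limit: int = 1500) -> Optional[pd.DataFrame]:
        """Return the most recent `limit` candles, oldest first, indexed by timestamp."""
        df = self.conn.execute("""
            SELECT timestamp, open, high, low, close, volume FROM ohlcv
            WHERE symbol = ? AND timeframe = ? ORDER BY timestamp DESC LIMIT ?
        """, [symbol, timeframe, limit]).df()
        if df.empty:
            return None
        return df.sort_values("timestamp").set_index("timestamp")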
@@ -142,15 +150,10 @@ class DataProvider:
    def __init__(self, symbols: List[str] = None, timeframes: List[str] = None):
        """Initialize the data provider"""
        self.config = get_config()
        # Fixed symbols and timeframes for caching
        # Fixed symbols and timeframes
        self.symbols = ['ETH/USDT', 'BTC/USDT']
        self.timeframes = ['1s', '1m', '1h', '1d']

        # Cache settings (initialize first)
        self.cache_enabled = True
        self.cache_dir = Path('cache')
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Data storage - cached OHLCV data (1500 candles each)
        self.cached_data = {} # {symbol: {timeframe: DataFrame}}
        self.real_time_data = {} # {symbol: {timeframe: deque}}
@@ -176,11 +179,7 @@ class DataProvider:

        # Pivot-based normalization system
        self.pivot_bounds: Dict[str, PivotBounds] = {} # {symbol: PivotBounds}
        self.pivot_cache_dir = self.cache_dir / 'pivot_bounds'
        self.pivot_cache_dir.mkdir(parents=True, exist_ok=True)
        self.pivot_refresh_interval = timedelta(days=1) # Refresh pivot bounds daily
        self.monthly_data_cache_dir = self.cache_dir / 'monthly_1s_data'
        self.monthly_data_cache_dir.mkdir(parents=True, exist_ok=True)

        # Enhanced WebSocket integration
        self.enhanced_cob_websocket: Optional[EnhancedCOBWebSocket] = None
@@ -266,11 +265,16 @@ class DataProvider:
        self.unified_storage: Optional['UnifiedDataProviderExtension'] = None
        self._unified_storage_enabled = False

        # Auto-fix corrupted cache files on startup
        self._auto_fix_corrupted_cache()
        # DuckDB storage - unified storage with native Parquet support
        self.duckdb_storage: Optional[DuckDBStorage] = None
        if DUCKDB_STORAGE_AVAILABLE:
            try:
                self.duckdb_storage = DuckDBStorage()
                logger.info("✅ DuckDB storage initialized (unified Parquet + SQL)")
            except Exception as e:
                logger.warning(f"Could not initialize DuckDB storage: {e}")

        # Load existing pivot bounds from cache
        self._load_all_pivot_bounds()
        # Pivot bounds will be calculated on demand

        # COB (Consolidated Order Book) data system using WebSocket
        self.cob_integration: Optional[COBIntegration] = None
@@ -1488,11 +1492,18 @@ class DataProvider:
            logger.error(f"Error getting market state at time: {e}")
            return {}

    def get_historical_data(self, symbol: str, timeframe: str, limit: int = 1000, refresh: bool = False) -> Optional[pd.DataFrame]:
    def get_historical_data(self, symbol: str, timeframe: str, limit: int = 1000, refresh: bool = False, allow_stale_cache: bool = False) -> Optional[pd.DataFrame]:
        """Get historical OHLCV data.
        - Prefer cached data for low latency.
        - If cache is empty or refresh=True, fetch real data from exchanges.
        - Never generate synthetic data.

        Args:
            symbol: Trading symbol
            timeframe: Timeframe
            limit: Number of candles to return
            refresh: Force refresh from exchange
            allow_stale_cache: Allow loading stale cache (for startup performance)
        """
        try:
            # Serve from cache when available
@@ -1501,6 +1512,17 @@ class DataProvider:
                if not cached_df.empty and not refresh:
                    return cached_df.tail(limit)

            # Try loading from DuckDB first (fast Parquet queries)
            if allow_stale_cache:
                cached_df = self._load_from_duckdb(symbol, timeframe, limit=1500)
                if cached_df is not None and not cached_df.empty:
                    logger.info(f"Loaded {len(cached_df)} candles from DuckDB for {symbol} {timeframe} (startup mode)")
                    # Store in memory cache
                    if symbol not in self.cached_data:
                        self.cached_data[symbol] = {}
                    self.cached_data[symbol][timeframe] = cached_df.tail(1500)
                    return cached_df.tail(limit)

            # Cache empty or refresh requested: fetch real data now
            df = self._fetch_from_binance(symbol, timeframe, limit)
            if (df is None or df.empty):
@@ -1508,7 +1530,15 @@ class DataProvider:

            if df is not None and not df.empty:
                df = self._ensure_datetime_index(df)
                # Store/merge into cache

                # Store in DuckDB (Parquet + SQL in one)
                if self.duckdb_storage:
                    try:
                        self.duckdb_storage.store_ohlcv_data(symbol, timeframe, df)
                    except Exception as e:
                        logger.warning(f"Could not store data in DuckDB: {e}")

                # Store/merge into memory cache (keep last 1500 candles for fast access)
                if symbol not in self.cached_data:
                    self.cached_data[symbol] = {}
                if timeframe not in self.cached_data[symbol] or self.cached_data[symbol][timeframe].empty:
@@ -1518,7 +1548,8 @@ class DataProvider:
                    combined_df = combined_df[~combined_df.index.duplicated(keep='last')]
                    combined_df = combined_df.sort_index()
                    self.cached_data[symbol][timeframe] = combined_df.tail(1500)
                logger.info(f"Cached {len(self.cached_data[symbol][timeframe])} candles for {symbol} {timeframe}")

                logger.info(f"Stored {len(df)} candles for {symbol} {timeframe} (DuckDB + memory cache)")
                return self.cached_data[symbol][timeframe].tail(limit)

            logger.warning(f"No real data available for {symbol} {timeframe} at request time")
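A hedged usage sketch of the updated call path: the symbols, timeframes, and parameters come from the __init__ and signature shown above, while the surrounding script is purely illustrative.

# Illustrative only: how the new allow_stale_cache flag would typically be used.
provider = DataProvider()

# Startup path: accept possibly stale candles from DuckDB to avoid blocking on exchange calls.
df_fast = provider.get_historical_data('ETH/USDT', '1m', limit=500, allow_stale_cache=True)

# Normal path: serve from the in-memory cache, falling back to a live Binance fetch.
df_live = provider.get_historical_data('ETH/USDT', '1m', limit=500, refresh=True)

if df_live is not None:
    print(f"Loaded {len(df_live)} candles; latest close: {df_live['close'].iloc[-1]}")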
@@ -2973,71 +3004,33 @@ class DataProvider:
            logger.debug(f"Error calculating RSI: {e}")
            return 50.0 # Default neutral value

    def _load_from_cache(self, symbol: str, timeframe: str) -> Optional[pd.DataFrame]:
        """Load data from cache"""
        try:
            cache_file = self.cache_dir / f"{symbol.replace('/', '')}_{timeframe}.parquet"
            if cache_file.exists():
                # Check if cache is recent - stricter rules for startup
                cache_age = time.time() - cache_file.stat().st_mtime

                # For 1m data, use cache only if less than 5 minutes old to avoid gaps
                if timeframe == '1m':
                    max_age = 300 # 5 minutes
                else:
                    max_age = 3600 # 1 hour for other timeframes

                if cache_age < max_age:
                    try:
                        df = pd.read_parquet(cache_file)
                        # Ensure cached data has proper timezone (UTC to match COB WebSocket data)
                        if not df.empty and 'timestamp' in df.columns:
                            if df['timestamp'].dt.tz is None:
                                # If no timezone info, assume UTC and keep in UTC
                                df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
                            elif str(df['timestamp'].dt.tz) != 'UTC':
                                # Convert to UTC if different timezone
                                df['timestamp'] = df['timestamp'].dt.tz_convert('UTC')
                        logger.debug(f"Loaded {len(df)} rows from cache for {symbol} {timeframe} (age: {cache_age/60:.1f}min)")
                        return df
                    except Exception as parquet_e:
                        # Handle corrupted Parquet file - expanded error detection
                        error_str = str(parquet_e).lower()
                        corrupted_indicators = [
                            "parquet magic bytes not found",
                            "corrupted",
                            "couldn't deserialize thrift",
                            "don't know what type",
                            "invalid parquet file",
                            "unexpected end of file",
                            "invalid metadata"
                        ]

                        if any(indicator in error_str for indicator in corrupted_indicators):
                            logger.warning(f"Corrupted Parquet cache file for {symbol} {timeframe}, removing and returning None: {parquet_e}")
                            try:
                                cache_file.unlink() # Delete corrupted file
                                logger.info(f"Deleted corrupted cache file: {cache_file}")
                            except Exception as delete_e:
                                logger.error(f"Failed to delete corrupted cache file: {delete_e}")
                            return None
                        else:
                            raise parquet_e
                else:
                    logger.debug(f"Cache for {symbol} {timeframe} is too old ({cache_age/60:.1f}min > {max_age/60:.1f}min)")
        except Exception as e:
            logger.warning(f"Error loading cache for {symbol} {timeframe}: {e}")
        return None

    def _save_to_cache(self, df: pd.DataFrame, symbol: str, timeframe: str):
        """Save data to cache"""
        try:
            cache_file = self.cache_dir / f"{symbol.replace('/', '')}_{timeframe}.parquet"
            df.to_parquet(cache_file, index=False)
            logger.debug(f"Saved {len(df)} rows to cache for {symbol} {timeframe}")
        except Exception as e:
            logger.warning(f"Error saving cache for {symbol} {timeframe}: {e}")

    def _load_from_duckdb(self, symbol: str, timeframe: str, limit: int = 1500) -> Optional[pd.DataFrame]:
        """Load data from DuckDB storage

        Args:
            symbol: Trading symbol
            timeframe: Timeframe
            limit: Number of candles to load
        """
        if not self.duckdb_storage:
            return None

        try:
            df = self.duckdb_storage.get_ohlcv_data(
                symbol=symbol,
                timeframe=timeframe,
                limit=limit
            )

            if df is not None and not df.empty:
                logger.debug(f"Loaded {len(df)} candles from DuckDB for {symbol} {timeframe}")
                return df

            return None

        except Exception as e:
            logger.warning(f"Error loading from DuckDB for {symbol} {timeframe}: {e}")
            return None

    async def start_real_time_streaming(self):
        """Start real-time data streaming using COBIntegration"""
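Since this commit drops the per-file Parquet cache in favour of DuckDB, a one-off backfill of the old cache files might look like the sketch below. It is not part of the commit: the glob-style file naming mirrors the removed _save_to_cache ("ETHUSDT_1m.parquet", ...), and the migrate_parquet_cache_to_duckdb helper is hypothetical.

# One-off migration sketch: import legacy Parquet cache files into DuckDB.
# Assumes the cache/ layout written by the removed _save_to_cache ("ETHUSDT_1m.parquet", ...).
from pathlib import Path
import pandas as pd

def migrate_parquet_cache_to_duckdb(cache_dir: Path, duckdb_storage, symbols, timeframes):
    """Best-effort backfill; unreadable files are skipped, mirroring the old cache behaviour."""
    for symbol in symbols:            # e.g. ['ETH/USDT', 'BTC/USDT']
        for timeframe in timeframes:  # e.g. ['1s', '1m', '1h', '1d']
            cache_file = cache_dir / f"{symbol.replace('/', '')}_{timeframe}.parquet"
            if not cache_file.exists():
                continue
            try:
                df = pd.read_parquet(cache_file)
                if not df.empty:
                    duckdb_storage.store_ohlcv_data(symbol, timeframe, df)
                    print(f"Migrated {len(df)} rows from {cache_file.name}")
            except Exception as e:
                print(f"Skipping {cache_file.name}: {e}")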