wip duckdb

Dobromir Popov
2025-10-24 16:19:42 +03:00
parent 809c32e7a8
commit 4b8f44d859
2 changed files with 200 additions and 180 deletions

View File

@@ -352,6 +352,9 @@ class DataProvider:
         # Start COB WebSocket integration
         self.start_cob_websocket_integration()
 
+        # Start periodic DuckDB storage of live data
+        self._start_periodic_duckdb_storage()
+
     # ===================================================================
     # UNIFIED STORAGE SYSTEM METHODS
     # ===================================================================
@@ -700,7 +703,7 @@ class DataProvider:
         self.catch_up_completed = True
 
     def _update_cached_data(self, symbol: str, timeframe: str):
-        """Update cached data by fetching last 2 candles with thread-safe locking"""
+        """Update cached data by fetching last 2 candles and storing to DuckDB"""
         try:
             # Fetch last 2 candles (outside lock - network I/O)
             df = self._fetch_from_binance(symbol, timeframe, 2)
@@ -712,6 +715,14 @@ class DataProvider:
             # Ensure proper datetime index
             df = self._ensure_datetime_index(df)
 
+            # Store to DuckDB immediately (live data persistence)
+            if self.duckdb_storage:
+                try:
+                    self.duckdb_storage.store_ohlcv_data(symbol, timeframe, df)
+                    logger.debug(f"Stored live data to DuckDB: {symbol} {timeframe} ({len(df)} candles)")
+                except Exception as e:
+                    logger.warning(f"Could not store live data to DuckDB: {e}")
+
             # Update cached data with lock
             with self.data_lock:
                 existing_df = self.cached_data[symbol][timeframe]
@@ -722,7 +733,7 @@ class DataProvider:
                     combined_df = combined_df[~combined_df.index.duplicated(keep='last')]
                     combined_df = combined_df.sort_index()
 
-                    # Keep only last 1500 candles
+                    # Keep only last 1500 candles in memory
                     self.cached_data[symbol][timeframe] = combined_df.tail(1500)
                 else:
                     self.cached_data[symbol][timeframe] = df
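
Editor's note: with live candles now written straight to DuckDB on every `_update_cached_data()` call, a quick way to confirm the persistence path works is to query the new `ohlcv_data` table directly. A minimal verification sketch — the database path and symbol are placeholders, not taken from this diff:

import duckdb

# Open the DuckDB file (placeholder path) read-only and summarize what has been stored
con = duckdb.connect("data/market_data.duckdb", read_only=True)
rows = con.execute("""
    SELECT timeframe,
           COUNT(*) AS candles,
           to_timestamp(MAX(timestamp) / 1000) AS latest_candle
    FROM ohlcv_data
    WHERE symbol = 'ETH/USDT'
    GROUP BY timeframe
    ORDER BY timeframe
""").fetchall()
for timeframe, candles, latest_candle in rows:
    print(f"{timeframe}: {candles} candles, latest {latest_candle}")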
@@ -2037,10 +2048,22 @@ class DataProvider:
     # === WILLIAMS MARKET STRUCTURE PIVOT SYSTEM ===
 
     def _collect_monthly_1m_data(self, symbol: str) -> Optional[pd.DataFrame]:
-        """Collect 30 days of 1m candles with smart gap-filling cache system"""
+        """Collect 30 days of 1m candles with DuckDB cache system"""
         try:
-            # Check for cached data and determine what we need to fetch
-            cached_data = self._load_monthly_data_from_cache(symbol)
+            # Check for cached data in DuckDB
+            cached_data = None
+            if self.duckdb_storage:
+                try:
+                    end_time_check = datetime.utcnow()
+                    start_time_check = end_time_check - timedelta(days=30)
+                    cached_data = self.duckdb_storage.get_ohlcv_data(
+                        symbol=symbol,
+                        timeframe='1m',
+                        start_time=start_time_check,
+                        end_time=end_time_check
+                    )
+                except Exception as e:
+                    logger.debug(f"No cached monthly data in DuckDB: {e}")
 
             import pytz
             utc = pytz.UTC
@@ -2097,8 +2120,12 @@ class DataProvider:
logger.info(f"Final dataset: {len(monthly_df)} 1m candles for {symbol}") logger.info(f"Final dataset: {len(monthly_df)} 1m candles for {symbol}")
# Update cache # Update DuckDB cache
self._save_monthly_data_to_cache(symbol, monthly_df) if self.duckdb_storage:
try:
self.duckdb_storage.store_ohlcv_data(symbol, '1m', monthly_df)
except Exception as e:
logger.warning(f"Could not cache monthly data in DuckDB: {e}")
return monthly_df return monthly_df
else: else:
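
Editor's note: since the 30-day 1m cache now lives in the same `ohlcv_data` table, reading it back goes through the same `get_ohlcv_data()` call shown above. A usage sketch, assuming `duckdb_storage` is an initialized `DuckDBStorage` instance and the symbol is a placeholder:

from datetime import datetime, timedelta

end_time = datetime.utcnow()
start_time = end_time - timedelta(days=30)

# Same query pattern _collect_monthly_1m_data() now uses for its cache check
monthly_df = duckdb_storage.get_ohlcv_data(
    symbol='ETH/USDT',
    timeframe='1m',
    start_time=start_time,
    end_time=end_time,
)
if monthly_df is not None and not monthly_df.empty:
    print(f"{len(monthly_df)} cached 1m candles, "
          f"{monthly_df.index.min()} -> {monthly_df.index.max()}")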
@@ -2473,8 +2500,37 @@ class DataProvider:
             # Store bounds
             self.pivot_bounds[symbol] = bounds
 
-            # Save to cache
-            self._save_pivot_bounds_to_cache(symbol, bounds)
+            # Save to DuckDB as JSON
+            if self.duckdb_storage:
+                try:
+                    bounds_json = {
+                        'symbol': symbol,
+                        'price_max': bounds.price_max,
+                        'price_min': bounds.price_min,
+                        'volume_max': bounds.volume_max,
+                        'volume_min': bounds.volume_min,
+                        'pivot_support_levels': bounds.pivot_support_levels,
+                        'pivot_resistance_levels': bounds.pivot_resistance_levels,
+                        'created_timestamp': bounds.created_timestamp.isoformat(),
+                        'data_period_start': bounds.data_period_start.isoformat(),
+                        'data_period_end': bounds.data_period_end.isoformat(),
+                        'total_candles_analyzed': bounds.total_candles_analyzed
+                    }
+
+                    # Store in DuckDB metadata table
+                    self.duckdb_storage.conn.execute("""
+                        CREATE TABLE IF NOT EXISTS pivot_bounds_cache (
+                            symbol VARCHAR PRIMARY KEY,
+                            bounds_data JSON,
+                            updated_at BIGINT
+                        )
+                    """)
+                    self.duckdb_storage.conn.execute("""
+                        INSERT OR REPLACE INTO pivot_bounds_cache (symbol, bounds_data, updated_at)
+                        VALUES (?, ?, ?)
+                    """, (symbol, json.dumps(bounds_json), int(datetime.now().timestamp() * 1000)))
+                except Exception as e:
+                    logger.warning(f"Could not save pivot bounds to DuckDB: {e}")
 
             logger.info(f"Successfully refreshed pivot bounds for {symbol}")
@@ -2530,136 +2586,83 @@ class DataProvider:
                 return symbol  # Return first symbol for now - can be improved
         return None
 
-    # === CACHE MANAGEMENT ===
+    # === DUCKDB CACHE MANAGEMENT ===
 
-    def _auto_fix_corrupted_cache(self):
-        """Automatically fix corrupted cache files on startup"""
-        try:
-            from utils.cache_manager import get_cache_manager
-            cache_manager = get_cache_manager()
-
-            # Quick health check
-            health_summary = cache_manager.get_cache_summary()
-
-            if health_summary['corrupted_files'] > 0:
-                logger.warning(f"Found {health_summary['corrupted_files']} corrupted cache files, cleaning up...")
-                # Auto-cleanup corrupted files (no confirmation needed)
-                deleted_files = cache_manager.cleanup_corrupted_files(dry_run=False)
-                deleted_count = 0
-                for cache_dir, files in deleted_files.items():
-                    for file_info in files:
-                        if "DELETED:" in file_info:
-                            deleted_count += 1
-                logger.info(f"Auto-cleaned {deleted_count} corrupted cache files")
-            else:
-                logger.debug("Cache health check passed - no corrupted files found")
-        except Exception as e:
-            logger.warning(f"Cache auto-fix failed: {e}")
-
-    # === PIVOT BOUNDS CACHING ===
-
-    def _load_all_pivot_bounds(self):
-        """Load all cached pivot bounds on startup"""
-        try:
-            for symbol in self.symbols:
-                bounds = self._load_pivot_bounds_from_cache(symbol)
-                if bounds:
-                    self.pivot_bounds[symbol] = bounds
-                    logger.info(f"Loaded cached pivot bounds for {symbol}")
-        except Exception as e:
-            logger.error(f"Error loading pivot bounds from cache: {e}")
-
-    def _load_pivot_bounds_from_cache(self, symbol: str) -> Optional[PivotBounds]:
-        """Load pivot bounds from cache"""
-        try:
-            cache_file = self.pivot_cache_dir / f"{symbol.replace('/', '')}_pivot_bounds.pkl"
-            if cache_file.exists():
-                with open(cache_file, 'rb') as f:
-                    bounds = pickle.load(f)
-
-                # Check if bounds are still valid (not too old)
-                age = datetime.now() - bounds.created_timestamp
-                if age <= self.pivot_refresh_interval:
-                    return bounds
-                else:
-                    logger.info(f"Cached pivot bounds for {symbol} are too old ({age.days} days)")
-            return None
-        except Exception as e:
-            logger.warning(f"Error loading pivot bounds from cache for {symbol}: {e}")
-            return None
-
-    def _save_pivot_bounds_to_cache(self, symbol: str, bounds: PivotBounds):
-        """Save pivot bounds to cache"""
-        try:
-            cache_file = self.pivot_cache_dir / f"{symbol.replace('/', '')}_pivot_bounds.pkl"
-            with open(cache_file, 'wb') as f:
-                pickle.dump(bounds, f)
-            logger.debug(f"Saved pivot bounds to cache for {symbol}")
-        except Exception as e:
-            logger.warning(f"Error saving pivot bounds to cache for {symbol}: {e}")
-
-    def _load_monthly_data_from_cache(self, symbol: str) -> Optional[pd.DataFrame]:
-        """Load monthly 1m data from cache"""
-        try:
-            cache_file = self.monthly_data_cache_dir / f"{symbol.replace('/', '')}_monthly_1m.parquet"
-            if cache_file.exists():
-                try:
-                    df = pd.read_parquet(cache_file)
-                    # Ensure cached monthly data has proper timezone (UTC to match COB WebSocket data)
-                    if not df.empty and 'timestamp' in df.columns:
-                        if df['timestamp'].dt.tz is None:
-                            # If no timezone info, assume UTC and keep in UTC
-                            df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
-                        elif str(df['timestamp'].dt.tz) != 'UTC':
-                            # Convert to UTC if different timezone
-                            df['timestamp'] = df['timestamp'].dt.tz_convert('UTC')
-                    logger.info(f"Loaded {len(df)} 1m candles from cache for {symbol}")
-                    return df
-                except Exception as parquet_e:
-                    # Handle corrupted Parquet file - expanded error detection
-                    error_str = str(parquet_e).lower()
-                    corrupted_indicators = [
-                        "parquet magic bytes not found",
-                        "corrupted",
-                        "couldn't deserialize thrift",
-                        "don't know what type",
-                        "invalid parquet file",
-                        "unexpected end of file",
-                        "invalid metadata"
-                    ]
-                    if any(indicator in error_str for indicator in corrupted_indicators):
-                        logger.warning(f"Corrupted Parquet cache file for {symbol}, removing and returning None: {parquet_e}")
-                        try:
-                            cache_file.unlink()  # Delete corrupted file
-                            logger.info(f"Deleted corrupted monthly cache file: {cache_file}")
-                        except Exception as delete_e:
-                            logger.error(f"Failed to delete corrupted monthly cache file: {delete_e}")
-                        return None
-                    else:
-                        raise parquet_e
-            return None
-        except Exception as e:
-            logger.warning(f"Error loading monthly data from cache for {symbol}: {e}")
-            return None
-
-    def _save_monthly_data_to_cache(self, symbol: str, df: pd.DataFrame):
-        """Save monthly 1m data to cache"""
-        try:
-            cache_file = self.monthly_data_cache_dir / f"{symbol.replace('/', '')}_monthly_1m.parquet"
-            df.to_parquet(cache_file, index=False)
-            logger.info(f"Saved {len(df)} monthly 1m candles to cache for {symbol}")
-        except Exception as e:
-            logger.warning(f"Error saving monthly data to cache for {symbol}: {e}")
+    def _start_periodic_duckdb_storage(self):
+        """Start periodic task to store live data to DuckDB"""
+        def storage_worker():
+            """Worker thread that periodically stores cached data to DuckDB"""
+            import time
+            logger.info("Started periodic DuckDB storage worker (every 60 seconds)")
+
+            while True:
+                try:
+                    time.sleep(60)  # Store every minute
+
+                    if not self.duckdb_storage:
+                        continue
+
+                    # Store all cached data to DuckDB
+                    for symbol in self.symbols:
+                        for timeframe in self.timeframes:
+                            with self.data_lock:
+                                df = self.cached_data.get(symbol, {}).get(timeframe)
+
+                            if df is not None and not df.empty:
+                                try:
+                                    # Store last 100 candles to avoid duplicates
+                                    self.duckdb_storage.store_ohlcv_data(symbol, timeframe, df.tail(100))
+                                    logger.debug(f"Periodic storage: {symbol} {timeframe} ({len(df.tail(100))} candles)")
+                                except Exception as e:
+                                    logger.warning(f"Error in periodic storage for {symbol} {timeframe}: {e}")
+                except Exception as e:
+                    logger.error(f"Error in periodic DuckDB storage worker: {e}")
+
+        # Start worker thread
+        storage_thread = Thread(target=storage_worker, daemon=True, name="DuckDBStorageWorker")
+        storage_thread.start()
+        logger.info("Periodic DuckDB storage worker started")
+
+    def _load_pivot_bounds_from_duckdb(self, symbol: str) -> Optional[PivotBounds]:
+        """Load pivot bounds from DuckDB"""
+        if not self.duckdb_storage:
+            return None
+
+        try:
+            result = self.duckdb_storage.conn.execute("""
+                SELECT bounds_data FROM pivot_bounds_cache WHERE symbol = ?
+            """, (symbol,)).fetchone()
+
+            if result:
+                bounds_json = json.loads(result[0])
+                bounds = PivotBounds(
+                    symbol=bounds_json['symbol'],
+                    price_max=bounds_json['price_max'],
+                    price_min=bounds_json['price_min'],
+                    volume_max=bounds_json['volume_max'],
+                    volume_min=bounds_json['volume_min'],
+                    pivot_support_levels=bounds_json['pivot_support_levels'],
+                    pivot_resistance_levels=bounds_json['pivot_resistance_levels'],
+                    pivot_context={},
+                    created_timestamp=datetime.fromisoformat(bounds_json['created_timestamp']),
+                    data_period_start=datetime.fromisoformat(bounds_json['data_period_start']),
+                    data_period_end=datetime.fromisoformat(bounds_json['data_period_end']),
+                    total_candles_analyzed=bounds_json['total_candles_analyzed']
+                )
+
+                # Check if cache is recent
+                age = datetime.now() - bounds.created_timestamp
+                if age < self.pivot_refresh_interval:
+                    logger.debug(f"Loaded pivot bounds from DuckDB for {symbol}")
+                    return bounds
+
+            return None
+        except Exception as e:
+            logger.debug(f"Could not load pivot bounds from DuckDB for {symbol}: {e}")
+            return None
 
     def get_pivot_bounds(self, symbol: str) -> Optional[PivotBounds]:
         """Get pivot bounds for a symbol"""
         return self.pivot_bounds.get(symbol)
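
Editor's note: the removed `_load_all_pivot_bounds()` was the startup hook for the old pickle cache; its DuckDB replacement `_load_pivot_bounds_from_duckdb()` is defined above, but this hunk does not show a call site. A hedged sketch of how it could be wired in during initialization, mirroring the old behavior (method placement is an assumption, not part of this commit):

def _load_all_pivot_bounds(self):
    """Load cached pivot bounds for all symbols from DuckDB on startup."""
    for symbol in self.symbols:
        try:
            bounds = self._load_pivot_bounds_from_duckdb(symbol)
            if bounds:
                self.pivot_bounds[symbol] = bounds
                logger.info(f"Loaded cached pivot bounds for {symbol} from DuckDB")
        except Exception as e:
            logger.warning(f"Could not load pivot bounds for {symbol}: {e}")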

View File

@@ -30,8 +30,8 @@ class DuckDBStorage:
         self.db_path = Path(db_path)
         self.db_path.parent.mkdir(parents=True, exist_ok=True)
 
-        # Parquet storage directory
-        self.parquet_dir = self.db_path.parent / "parquet_store"
+        # Parquet storage directory (only for annotation snapshots)
+        self.parquet_dir = self.db_path.parent / "annotation_snapshots"
         self.parquet_dir.mkdir(parents=True, exist_ok=True)
 
         # Connect to DuckDB
@@ -41,12 +41,42 @@ class DuckDBStorage:
         self._init_schema()
 
         logger.info(f"DuckDB storage initialized: {self.db_path}")
-        logger.info(f"Parquet storage: {self.parquet_dir}")
+        logger.info(f"Annotation snapshots: {self.parquet_dir}")
 
     def _init_schema(self):
-        """Initialize database schema with Parquet integration"""
-        # Create annotations table (metadata only)
+        """Initialize database schema - all data in DuckDB tables"""
+        # Create OHLCV data table - stores ALL candles
+        self.conn.execute("""
+            CREATE SEQUENCE IF NOT EXISTS ohlcv_id_seq START 1
+        """)
+        self.conn.execute("""
+            CREATE TABLE IF NOT EXISTS ohlcv_data (
+                id INTEGER PRIMARY KEY DEFAULT nextval('ohlcv_id_seq'),
+                symbol VARCHAR NOT NULL,
+                timeframe VARCHAR NOT NULL,
+                timestamp BIGINT NOT NULL,
+                open DOUBLE NOT NULL,
+                high DOUBLE NOT NULL,
+                low DOUBLE NOT NULL,
+                close DOUBLE NOT NULL,
+                volume DOUBLE NOT NULL,
+                created_at BIGINT NOT NULL,
+                UNIQUE(symbol, timeframe, timestamp)
+            )
+        """)
+
+        # Create indexes for fast queries
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_timeframe
+            ON ohlcv_data(symbol, timeframe)
+        """)
+        self.conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_ohlcv_timestamp
+            ON ohlcv_data(timestamp)
+        """)
+
+        # Create annotations table
         self.conn.execute("""
             CREATE TABLE IF NOT EXISTS annotations (
                 annotation_id VARCHAR PRIMARY KEY,
@@ -72,7 +102,7 @@ class DuckDBStorage:
             CREATE TABLE IF NOT EXISTS cache_metadata (
                 symbol VARCHAR NOT NULL,
                 timeframe VARCHAR NOT NULL,
-                parquet_path VARCHAR NOT NULL,
+                parquet_path VARCHAR,
                 first_timestamp BIGINT NOT NULL,
                 last_timestamp BIGINT NOT NULL,
                 candle_count INTEGER NOT NULL,
@@ -81,11 +111,11 @@ class DuckDBStorage:
             )
         """)
 
-        logger.info("DuckDB schema initialized")
+        logger.info("DuckDB schema initialized (all data in tables)")
 
     def store_ohlcv_data(self, symbol: str, timeframe: str, df: pd.DataFrame) -> int:
         """
-        Store OHLCV data as Parquet file and register in DuckDB
+        Store OHLCV data directly in DuckDB table
 
         Args:
             symbol: Trading symbol
@@ -113,38 +143,41 @@ class DuckDBStorage:
             # Add metadata
             df_copy['symbol'] = symbol
             df_copy['timeframe'] = timeframe
+            df_copy['created_at'] = int(datetime.now().timestamp() * 1000)
 
-            # Define parquet file path
-            parquet_file = self.parquet_dir / f"{symbol.replace('/', '_')}_{timeframe}.parquet"
-
-            # Load existing data if file exists
-            if parquet_file.exists():
-                try:
-                    existing_df = pd.read_parquet(parquet_file)
-                    # Combine with new data
-                    df_copy = pd.concat([existing_df, df_copy], ignore_index=True)
-                    # Remove duplicates
-                    df_copy = df_copy.drop_duplicates(subset=['timestamp'], keep='last')
-                    df_copy = df_copy.sort_values('timestamp')
-                except Exception as e:
-                    logger.warning(f"Could not load existing parquet: {e}")
-
-            # Save to parquet
-            df_copy.to_parquet(parquet_file, index=False, compression='snappy')
-
-            # Update metadata in DuckDB
-            first_ts = int(df_copy['timestamp'].min())
-            last_ts = int(df_copy['timestamp'].max())
-            count = len(df_copy)
+            # Select columns in correct order
+            columns = ['symbol', 'timeframe', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'created_at']
+            df_insert = df_copy[columns]
+
+            # Insert data directly into DuckDB (ignore duplicates)
+            # Note: id column is auto-generated, so we don't include it
+            self.conn.execute("""
+                INSERT INTO ohlcv_data (symbol, timeframe, timestamp, open, high, low, close, volume, created_at)
+                SELECT symbol, timeframe, timestamp, open, high, low, close, volume, created_at
+                FROM df_insert
+                ON CONFLICT DO NOTHING
+            """)
+
+            # Update metadata
+            result = self.conn.execute("""
+                SELECT
+                    MIN(timestamp) as first_ts,
+                    MAX(timestamp) as last_ts,
+                    COUNT(*) as count
+                FROM ohlcv_data
+                WHERE symbol = ? AND timeframe = ?
+            """, (symbol, timeframe)).fetchone()
+
+            first_ts, last_ts, count = result
             now_ts = int(datetime.now().timestamp() * 1000)
 
             self.conn.execute("""
                 INSERT OR REPLACE INTO cache_metadata
                 (symbol, timeframe, parquet_path, first_timestamp, last_timestamp, candle_count, last_update)
                 VALUES (?, ?, ?, ?, ?, ?, ?)
-            """, (symbol, timeframe, str(parquet_file), first_ts, last_ts, count, now_ts))
+            """, (symbol, timeframe, '', first_ts, last_ts, count, now_ts))
 
-            logger.info(f"Stored {len(df)} candles for {symbol} {timeframe} in Parquet (total: {count})")
+            logger.info(f"Stored {len(df)} candles for {symbol} {timeframe} in DuckDB (total: {count})")
             return len(df)
 
         except Exception as e:
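
Editor's note: the `FROM df_insert` in the new INSERT relies on the DuckDB Python client's replacement scans — an unqualified table name is resolved against pandas DataFrames visible in the caller's scope, so the frame can be bulk-inserted without an explicit register step. A standalone sketch of the same pattern (table name and values are made up for illustration):

import duckdb
import pandas as pd

con = duckdb.connect()  # in-memory database for the demo
con.execute("""
    CREATE TABLE ohlcv_demo (
        symbol VARCHAR, timestamp BIGINT, close DOUBLE,
        UNIQUE(symbol, timestamp)
    )
""")

df_insert = pd.DataFrame({
    "symbol": ["ETH/USDT", "ETH/USDT"],
    "timestamp": [1729775940000, 1729776000000],
    "close": [2514.2, 2515.0],
})

# df_insert is resolved by name through a replacement scan, no register() needed
insert_sql = """
    INSERT INTO ohlcv_demo (symbol, timestamp, close)
    SELECT symbol, timestamp, close FROM df_insert
    ON CONFLICT DO NOTHING
"""
con.execute(insert_sql)
con.execute(insert_sql)  # re-inserting the same candles is a no-op, as in the periodic storage worker
print(con.execute("SELECT COUNT(*) FROM ohlcv_demo").fetchone()[0])  # -> 2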
@@ -158,7 +191,7 @@ class DuckDBStorage:
                         end_time: Optional[datetime] = None,
                         limit: Optional[int] = None) -> Optional[pd.DataFrame]:
         """
-        Query OHLCV data directly from Parquet using DuckDB
+        Query OHLCV data directly from DuckDB table
 
         Args:
             symbol: Trading symbol
@@ -171,26 +204,10 @@ class DuckDBStorage:
             DataFrame with OHLCV data
         """
         try:
-            # Get parquet file path from metadata
-            result = self.conn.execute("""
-                SELECT parquet_path FROM cache_metadata
-                WHERE symbol = ? AND timeframe = ?
-            """, (symbol, timeframe)).fetchone()
-
-            if not result:
-                logger.debug(f"No data found for {symbol} {timeframe}")
-                return None
-
-            parquet_path = result[0]
-
-            if not Path(parquet_path).exists():
-                logger.warning(f"Parquet file not found: {parquet_path}")
-                return None
-
-            # Build query - DuckDB can query Parquet directly!
-            query = f"""
+            # Build query
+            query = """
                 SELECT timestamp, open, high, low, close, volume
-                FROM read_parquet('{parquet_path}')
+                FROM ohlcv_data
                 WHERE symbol = ? AND timeframe = ?
             """
 
             params = [symbol, timeframe]
@@ -219,7 +236,7 @@ class DuckDBStorage:
             df = df.set_index('timestamp')
             df = df.sort_index()
 
-            logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from Parquet")
+            logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from DuckDB")
             return df
 
         except Exception as e:
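
Editor's note: taken together, the storage class now round-trips entirely through DuckDB. A usage sketch under stated assumptions: the constructor path is a placeholder, and the timestamp handling assumes the part of `store_ohlcv_data()` outside this hunk accepts a `timestamp` column already in epoch milliseconds; if it expects a DatetimeIndex instead, the input frame needs adjusting.

import pandas as pd

storage = DuckDBStorage("data/demo_market.duckdb")  # placeholder path

candles = pd.DataFrame({
    "timestamp": [1729775880000, 1729775940000, 1729776000000],  # epoch ms (assumption, see above)
    "open":   [2513.0, 2514.0, 2515.0],
    "high":   [2514.5, 2515.5, 2516.0],
    "low":    [2512.5, 2513.5, 2514.5],
    "close":  [2514.0, 2515.0, 2515.5],
    "volume": [11.2, 9.8, 13.4],
})

stored = storage.store_ohlcv_data("ETH/USDT", "1m", candles)
df = storage.get_ohlcv_data("ETH/USDT", "1m", limit=3)
print(f"stored {stored} candles, read back {0 if df is None else len(df)}")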