WIP: DuckDB storage - keep OHLCV candles in DuckDB tables instead of Parquet files

This commit is contained in:
Dobromir Popov
2025-10-24 16:19:42 +03:00
parent 809c32e7a8
commit 4b8f44d859
2 changed files with 200 additions and 180 deletions

View File

@@ -30,8 +30,8 @@ class DuckDBStorage:
self.db_path = Path(db_path)
self.db_path.parent.mkdir(parents=True, exist_ok=True)
# Parquet storage directory
self.parquet_dir = self.db_path.parent / "parquet_store"
# Parquet storage directory (only for annotation snapshots)
self.parquet_dir = self.db_path.parent / "annotation_snapshots"
self.parquet_dir.mkdir(parents=True, exist_ok=True)
# Connect to DuckDB
@@ -41,12 +41,42 @@ class DuckDBStorage:
self._init_schema()
logger.info(f"DuckDB storage initialized: {self.db_path}")
logger.info(f"Parquet storage: {self.parquet_dir}")
logger.info(f"Annotation snapshots: {self.parquet_dir}")
def _init_schema(self):
"""Initialize database schema with Parquet integration"""
"""Initialize database schema - all data in DuckDB tables"""
# Create annotations table (metadata only)
# Create OHLCV data table - stores ALL candles
self.conn.execute("""
CREATE SEQUENCE IF NOT EXISTS ohlcv_id_seq START 1
""")
self.conn.execute("""
CREATE TABLE IF NOT EXISTS ohlcv_data (
id INTEGER PRIMARY KEY DEFAULT nextval('ohlcv_id_seq'),
symbol VARCHAR NOT NULL,
timeframe VARCHAR NOT NULL,
timestamp BIGINT NOT NULL,
open DOUBLE NOT NULL,
high DOUBLE NOT NULL,
low DOUBLE NOT NULL,
close DOUBLE NOT NULL,
volume DOUBLE NOT NULL,
created_at BIGINT NOT NULL,
UNIQUE(symbol, timeframe, timestamp)
)
""")
# Create indexes for fast queries
self.conn.execute("""
CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_timeframe
ON ohlcv_data(symbol, timeframe)
""")
self.conn.execute("""
CREATE INDEX IF NOT EXISTS idx_ohlcv_timestamp
ON ohlcv_data(timestamp)
""")
# Create annotations table
self.conn.execute("""
CREATE TABLE IF NOT EXISTS annotations (
annotation_id VARCHAR PRIMARY KEY,
@@ -72,7 +102,7 @@ class DuckDBStorage:
CREATE TABLE IF NOT EXISTS cache_metadata (
symbol VARCHAR NOT NULL,
timeframe VARCHAR NOT NULL,
parquet_path VARCHAR NOT NULL,
parquet_path VARCHAR,
first_timestamp BIGINT NOT NULL,
last_timestamp BIGINT NOT NULL,
candle_count INTEGER NOT NULL,
@@ -81,11 +111,11 @@ class DuckDBStorage:
)
""")
logger.info("DuckDB schema initialized")
logger.info("DuckDB schema initialized (all data in tables)")
def store_ohlcv_data(self, symbol: str, timeframe: str, df: pd.DataFrame) -> int:
"""
Store OHLCV data as Parquet file and register in DuckDB
Store OHLCV data directly in DuckDB table
Args:
symbol: Trading symbol
@@ -113,38 +143,41 @@ class DuckDBStorage:
# Add metadata
df_copy['symbol'] = symbol
df_copy['timeframe'] = timeframe
df_copy['created_at'] = int(datetime.now().timestamp() * 1000)
# Define parquet file path
parquet_file = self.parquet_dir / f"{symbol.replace('/', '_')}_{timeframe}.parquet"
# Select columns in correct order
columns = ['symbol', 'timeframe', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'created_at']
df_insert = df_copy[columns]
# Load existing data if file exists
if parquet_file.exists():
try:
existing_df = pd.read_parquet(parquet_file)
# Combine with new data
df_copy = pd.concat([existing_df, df_copy], ignore_index=True)
# Remove duplicates
df_copy = df_copy.drop_duplicates(subset=['timestamp'], keep='last')
df_copy = df_copy.sort_values('timestamp')
except Exception as e:
logger.warning(f"Could not load existing parquet: {e}")
# Insert data directly into DuckDB (ignore duplicates)
# Note: id column is auto-generated, so we don't include it
self.conn.execute("""
INSERT INTO ohlcv_data (symbol, timeframe, timestamp, open, high, low, close, volume, created_at)
SELECT symbol, timeframe, timestamp, open, high, low, close, volume, created_at
FROM df_insert
ON CONFLICT DO NOTHING
""")
# Save to parquet
df_copy.to_parquet(parquet_file, index=False, compression='snappy')
# Recompute metadata (first/last timestamp, candle count) from the full table
result = self.conn.execute("""
SELECT
MIN(timestamp) as first_ts,
MAX(timestamp) as last_ts,
COUNT(*) as count
FROM ohlcv_data
WHERE symbol = ? AND timeframe = ?
""", (symbol, timeframe)).fetchone()
# Update metadata in DuckDB
first_ts = int(df_copy['timestamp'].min())
last_ts = int(df_copy['timestamp'].max())
count = len(df_copy)
first_ts, last_ts, count = result
now_ts = int(datetime.now().timestamp() * 1000)
self.conn.execute("""
INSERT OR REPLACE INTO cache_metadata
(symbol, timeframe, parquet_path, first_timestamp, last_timestamp, candle_count, last_update)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (symbol, timeframe, str(parquet_file), first_ts, last_ts, count, now_ts))
""", (symbol, timeframe, '', first_ts, last_ts, count, now_ts))
logger.info(f"Stored {len(df)} candles for {symbol} {timeframe} in Parquet (total: {count})")
logger.info(f"Stored {len(df)} candles for {symbol} {timeframe} in DuckDB (total: {count})")
return len(df)
except Exception as e:
@@ -158,7 +191,7 @@ class DuckDBStorage:
end_time: Optional[datetime] = None,
limit: Optional[int] = None) -> Optional[pd.DataFrame]:
"""
Query OHLCV data directly from Parquet using DuckDB
Query OHLCV data directly from DuckDB table
Args:
symbol: Trading symbol
@@ -171,26 +204,10 @@ class DuckDBStorage:
DataFrame with OHLCV data
"""
try:
# Get parquet file path from metadata
result = self.conn.execute("""
SELECT parquet_path FROM cache_metadata
WHERE symbol = ? AND timeframe = ?
""", (symbol, timeframe)).fetchone()
if not result:
logger.debug(f"No data found for {symbol} {timeframe}")
return None
parquet_path = result[0]
if not Path(parquet_path).exists():
logger.warning(f"Parquet file not found: {parquet_path}")
return None
# Build query - DuckDB can query Parquet directly!
query = f"""
# Build query
query = """
SELECT timestamp, open, high, low, close, volume
FROM read_parquet('{parquet_path}')
FROM ohlcv_data
WHERE symbol = ? AND timeframe = ?
"""
params = [symbol, timeframe]
@@ -219,7 +236,7 @@ class DuckDBStorage:
df = df.set_index('timestamp')
df = df.sort_index()
logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from Parquet")
logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from DuckDB")
return df
except Exception as e: