wip duckdb
This commit is contained in:
@@ -30,8 +30,8 @@ class DuckDBStorage:
|
||||
self.db_path = Path(db_path)
|
||||
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Parquet storage directory
|
||||
self.parquet_dir = self.db_path.parent / "parquet_store"
|
||||
# Parquet storage directory (only for annotation snapshots)
|
||||
self.parquet_dir = self.db_path.parent / "annotation_snapshots"
|
||||
self.parquet_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Connect to DuckDB
|
||||
@@ -41,12 +41,42 @@ class DuckDBStorage:
|
||||
self._init_schema()
|
||||
|
||||
logger.info(f"DuckDB storage initialized: {self.db_path}")
|
||||
logger.info(f"Parquet storage: {self.parquet_dir}")
|
||||
logger.info(f"Annotation snapshots: {self.parquet_dir}")
|
||||
|
||||
def _init_schema(self):
|
||||
"""Initialize database schema with Parquet integration"""
|
||||
"""Initialize database schema - all data in DuckDB tables"""
|
||||
|
||||
# Create annotations table (metadata only)
|
||||
# Create OHLCV data table - stores ALL candles
|
||||
self.conn.execute("""
|
||||
CREATE SEQUENCE IF NOT EXISTS ohlcv_id_seq START 1
|
||||
""")
|
||||
self.conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS ohlcv_data (
|
||||
id INTEGER PRIMARY KEY DEFAULT nextval('ohlcv_id_seq'),
|
||||
symbol VARCHAR NOT NULL,
|
||||
timeframe VARCHAR NOT NULL,
|
||||
timestamp BIGINT NOT NULL,
|
||||
open DOUBLE NOT NULL,
|
||||
high DOUBLE NOT NULL,
|
||||
low DOUBLE NOT NULL,
|
||||
close DOUBLE NOT NULL,
|
||||
volume DOUBLE NOT NULL,
|
||||
created_at BIGINT NOT NULL,
|
||||
UNIQUE(symbol, timeframe, timestamp)
|
||||
)
|
||||
""")
|
||||
|
||||
# Create indexes for fast queries
|
||||
self.conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_ohlcv_symbol_timeframe
|
||||
ON ohlcv_data(symbol, timeframe)
|
||||
""")
|
||||
self.conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_ohlcv_timestamp
|
||||
ON ohlcv_data(timestamp)
|
||||
""")
|
||||
|
||||
# Create annotations table
|
||||
self.conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS annotations (
|
||||
annotation_id VARCHAR PRIMARY KEY,
|
||||
@@ -72,7 +102,7 @@ class DuckDBStorage:
|
||||
CREATE TABLE IF NOT EXISTS cache_metadata (
|
||||
symbol VARCHAR NOT NULL,
|
||||
timeframe VARCHAR NOT NULL,
|
||||
parquet_path VARCHAR NOT NULL,
|
||||
parquet_path VARCHAR,
|
||||
first_timestamp BIGINT NOT NULL,
|
||||
last_timestamp BIGINT NOT NULL,
|
||||
candle_count INTEGER NOT NULL,
|
||||
@@ -81,11 +111,11 @@ class DuckDBStorage:
|
||||
)
|
||||
""")
|
||||
|
||||
logger.info("DuckDB schema initialized")
|
||||
logger.info("DuckDB schema initialized (all data in tables)")
|
||||
|
||||
def store_ohlcv_data(self, symbol: str, timeframe: str, df: pd.DataFrame) -> int:
|
||||
"""
|
||||
Store OHLCV data as Parquet file and register in DuckDB
|
||||
Store OHLCV data directly in DuckDB table
|
||||
|
||||
Args:
|
||||
symbol: Trading symbol
|
||||
@@ -113,38 +143,41 @@ class DuckDBStorage:
|
||||
# Add metadata
|
||||
df_copy['symbol'] = symbol
|
||||
df_copy['timeframe'] = timeframe
|
||||
df_copy['created_at'] = int(datetime.now().timestamp() * 1000)
|
||||
|
||||
# Define parquet file path
|
||||
parquet_file = self.parquet_dir / f"{symbol.replace('/', '_')}_{timeframe}.parquet"
|
||||
# Select columns in correct order
|
||||
columns = ['symbol', 'timeframe', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'created_at']
|
||||
df_insert = df_copy[columns]
|
||||
|
||||
# Load existing data if file exists
|
||||
if parquet_file.exists():
|
||||
try:
|
||||
existing_df = pd.read_parquet(parquet_file)
|
||||
# Combine with new data
|
||||
df_copy = pd.concat([existing_df, df_copy], ignore_index=True)
|
||||
# Remove duplicates
|
||||
df_copy = df_copy.drop_duplicates(subset=['timestamp'], keep='last')
|
||||
df_copy = df_copy.sort_values('timestamp')
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not load existing parquet: {e}")
|
||||
# Insert data directly into DuckDB (ignore duplicates)
|
||||
# Note: id column is auto-generated, so we don't include it
|
||||
self.conn.execute("""
|
||||
INSERT INTO ohlcv_data (symbol, timeframe, timestamp, open, high, low, close, volume, created_at)
|
||||
SELECT symbol, timeframe, timestamp, open, high, low, close, volume, created_at
|
||||
FROM df_insert
|
||||
ON CONFLICT DO NOTHING
|
||||
""")
|
||||
|
||||
# Save to parquet
|
||||
df_copy.to_parquet(parquet_file, index=False, compression='snappy')
|
||||
# Update metadata
|
||||
result = self.conn.execute("""
|
||||
SELECT
|
||||
MIN(timestamp) as first_ts,
|
||||
MAX(timestamp) as last_ts,
|
||||
COUNT(*) as count
|
||||
FROM ohlcv_data
|
||||
WHERE symbol = ? AND timeframe = ?
|
||||
""", (symbol, timeframe)).fetchone()
|
||||
|
||||
# Update metadata in DuckDB
|
||||
first_ts = int(df_copy['timestamp'].min())
|
||||
last_ts = int(df_copy['timestamp'].max())
|
||||
count = len(df_copy)
|
||||
first_ts, last_ts, count = result
|
||||
now_ts = int(datetime.now().timestamp() * 1000)
|
||||
|
||||
self.conn.execute("""
|
||||
INSERT OR REPLACE INTO cache_metadata
|
||||
(symbol, timeframe, parquet_path, first_timestamp, last_timestamp, candle_count, last_update)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
""", (symbol, timeframe, str(parquet_file), first_ts, last_ts, count, now_ts))
|
||||
""", (symbol, timeframe, '', first_ts, last_ts, count, now_ts))
|
||||
|
||||
logger.info(f"Stored {len(df)} candles for {symbol} {timeframe} in Parquet (total: {count})")
|
||||
logger.info(f"Stored {len(df)} candles for {symbol} {timeframe} in DuckDB (total: {count})")
|
||||
return len(df)
|
||||
|
||||
except Exception as e:
|
||||
@@ -158,7 +191,7 @@ class DuckDBStorage:
|
||||
end_time: Optional[datetime] = None,
|
||||
limit: Optional[int] = None) -> Optional[pd.DataFrame]:
|
||||
"""
|
||||
Query OHLCV data directly from Parquet using DuckDB
|
||||
Query OHLCV data directly from DuckDB table
|
||||
|
||||
Args:
|
||||
symbol: Trading symbol
|
||||
@@ -171,26 +204,10 @@ class DuckDBStorage:
|
||||
DataFrame with OHLCV data
|
||||
"""
|
||||
try:
|
||||
# Get parquet file path from metadata
|
||||
result = self.conn.execute("""
|
||||
SELECT parquet_path FROM cache_metadata
|
||||
WHERE symbol = ? AND timeframe = ?
|
||||
""", (symbol, timeframe)).fetchone()
|
||||
|
||||
if not result:
|
||||
logger.debug(f"No data found for {symbol} {timeframe}")
|
||||
return None
|
||||
|
||||
parquet_path = result[0]
|
||||
|
||||
if not Path(parquet_path).exists():
|
||||
logger.warning(f"Parquet file not found: {parquet_path}")
|
||||
return None
|
||||
|
||||
# Build query - DuckDB can query Parquet directly!
|
||||
query = f"""
|
||||
# Build query
|
||||
query = """
|
||||
SELECT timestamp, open, high, low, close, volume
|
||||
FROM read_parquet('{parquet_path}')
|
||||
FROM ohlcv_data
|
||||
WHERE symbol = ? AND timeframe = ?
|
||||
"""
|
||||
params = [symbol, timeframe]
|
||||
@@ -219,7 +236,7 @@ class DuckDBStorage:
|
||||
df = df.set_index('timestamp')
|
||||
df = df.sort_index()
|
||||
|
||||
logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from Parquet")
|
||||
logger.debug(f"Retrieved {len(df)} candles for {symbol} {timeframe} from DuckDB")
|
||||
return df
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user