"""
|
|
Multi-Timeframe, Multi-Symbol Data Provider
|
|
|
|
This module consolidates all data functionality including:
|
|
- Historical data fetching from Binance API
|
|
- Real-time data streaming via WebSocket
|
|
- Multi-timeframe candle generation
|
|
- Caching and data management
|
|
- Technical indicators calculation
|
|
- Williams Market Structure pivot points with monthly data analysis
|
|
- Pivot-based feature normalization for improved model training
|
|
- Centralized data distribution to multiple subscribers (AI models, dashboard, etc.)
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import time
|
|
import uuid
|
|
import websockets
|
|
import requests
|
|
import pandas as pd
|
|
import numpy as np
|
|
import pickle
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple, Any, Callable
|
|
from dataclasses import dataclass, field
|
|
import ta
|
|
from threading import Thread, Lock
|
|
from collections import deque
|
|
import math
|
|
|
|
from .config import get_config
|
|
from .tick_aggregator import RealTimeTickAggregator, RawTick, OHLCVBar
|
|
from .cnn_monitor import log_cnn_prediction
|
|
from .williams_market_structure import WilliamsMarketStructure, PivotPoint, TrendLevel
|
|
from .enhanced_cob_websocket import EnhancedCOBWebSocket, get_enhanced_cob_websocket
|
|
from .cob_integration import COBIntegration
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
@dataclass
class PivotBounds:
    """Pivot-based normalization bounds derived from Williams Market Structure"""
    symbol: str
    price_max: float
    price_min: float
    volume_max: float
    volume_min: float
    pivot_support_levels: List[float]
    pivot_resistance_levels: List[float]
    pivot_context: Dict[str, Any]
    created_timestamp: datetime
    data_period_start: datetime
    data_period_end: datetime
    total_candles_analyzed: int

    def get_price_range(self) -> float:
        """Get price range for normalization"""
        return self.price_max - self.price_min

    def normalize_price(self, price: float) -> float:
        """Normalize price using pivot bounds"""
        return (price - self.price_min) / self.get_price_range()

    def get_nearest_support_distance(self, current_price: float) -> float:
        """Get distance to nearest support level (normalized)"""
        if not self.pivot_support_levels:
            return 0.5
        distances = [abs(current_price - s) for s in self.pivot_support_levels]
        return min(distances) / self.get_price_range()

    def get_nearest_resistance_distance(self, current_price: float) -> float:
        """Get distance to nearest resistance level (normalized)"""
        if not self.pivot_resistance_levels:
            return 0.5
        distances = [abs(current_price - r) for r in self.pivot_resistance_levels]
        return min(distances) / self.get_price_range()


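# Illustrative sketch (not part of the original module): how PivotBounds is
# meant to be used for pivot-based feature normalization. All numbers below
# are hypothetical.
#
#     bounds = PivotBounds(
#         symbol='ETH/USDT', price_max=4000.0, price_min=3000.0,
#         volume_max=1_000_000.0, volume_min=0.0,
#         pivot_support_levels=[3100.0, 3250.0],
#         pivot_resistance_levels=[3800.0, 3950.0],
#         pivot_context={}, created_timestamp=datetime.utcnow(),
#         data_period_start=datetime.utcnow() - timedelta(days=30),
#         data_period_end=datetime.utcnow(), total_candles_analyzed=43200,
#     )
#     bounds.normalize_price(3500.0)               # (3500 - 3000) / 1000 = 0.5
#     bounds.get_nearest_support_distance(3300.0)  # |3300 - 3250| / 1000 = 0.05

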
@dataclass
class MarketTick:
    """Standardized market tick data structure"""
    symbol: str
    timestamp: datetime
    price: float
    volume: float
    quantity: float
    side: str  # 'buy' or 'sell'
    trade_id: str
    is_buyer_maker: bool
    raw_data: Dict[str, Any] = field(default_factory=dict)


@dataclass
class DataSubscriber:
    """Data subscriber information"""
    subscriber_id: str
    callback: Callable[[MarketTick], None]
    symbols: List[str]
    active: bool = True
    last_update: datetime = field(default_factory=datetime.now)
    tick_count: int = 0
    subscriber_name: str = "unknown"


class DataProvider:
    """Unified data provider for historical and real-time market data with centralized distribution"""

    def __init__(self, symbols: List[str] = None, timeframes: List[str] = None):
        """Initialize the data provider (the symbols/timeframes arguments are currently ignored; fixed sets are used for caching)"""
        self.config = get_config()
        # Fixed symbols and timeframes for caching
        self.symbols = ['ETH/USDT', 'BTC/USDT']
        self.timeframes = ['1s', '1m', '1h', '1d']

        # Cache settings (initialize first)
        self.cache_enabled = True
        self.cache_dir = Path('cache')
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Data storage - cached OHLCV data (1500 candles each)
        self.cached_data = {}      # {symbol: {timeframe: DataFrame}}
        self.real_time_data = {}   # {symbol: {timeframe: deque}}
        self.current_prices = {}   # {symbol: float}

        # Initialize cached data structure
        for symbol in self.symbols:
            self.cached_data[symbol] = {}
            for timeframe in self.timeframes:
                self.cached_data[symbol][timeframe] = pd.DataFrame()

        # Pivot-based normalization system
        self.pivot_bounds: Dict[str, PivotBounds] = {}  # {symbol: PivotBounds}
        self.pivot_cache_dir = self.cache_dir / 'pivot_bounds'
        self.pivot_cache_dir.mkdir(parents=True, exist_ok=True)
        self.pivot_refresh_interval = timedelta(days=1)  # Refresh pivot bounds daily
        self.monthly_data_cache_dir = self.cache_dir / 'monthly_1s_data'
        self.monthly_data_cache_dir.mkdir(parents=True, exist_ok=True)

        # Enhanced WebSocket integration
        self.enhanced_cob_websocket: Optional[EnhancedCOBWebSocket] = None
        self.websocket_tasks = {}
        self.is_streaming = False
        self.data_lock = Lock()

        # COB data from enhanced WebSocket
        self.cob_websocket_data: Dict[str, Dict] = {}   # Latest COB data from WebSocket
        self.cob_websocket_status: Dict[str, str] = {}  # WebSocket status per symbol

        # Subscriber management for centralized data distribution
        self.subscribers: Dict[str, DataSubscriber] = {}
        self.subscriber_lock = Lock()
        self.tick_buffers: Dict[str, deque] = {}
        self.buffer_size = 1000  # Keep last 1000 ticks per symbol

        # Initialize tick buffers
        for symbol in self.symbols:
            binance_symbol = symbol.replace('/', '').upper()
            self.tick_buffers[binance_symbol] = deque(maxlen=self.buffer_size)

        # BOM (Book of Market) data caching - 1s resolution for the last 5 minutes
        self.bom_cache_duration = 300  # 5 minutes in seconds
        self.bom_feature_count = 120   # Number of BOM features per timestamp
        self.bom_data_cache: Dict[str, deque] = {}  # {symbol: deque of (timestamp, bom_features)}

        # Initialize BOM cache for each symbol
        for symbol in self.symbols:
            # Store 300 seconds worth of 1s BOM data
            self.bom_data_cache[symbol] = deque(maxlen=self.bom_cache_duration)

        # Initialize tick aggregator for raw tick processing
        binance_symbols = [symbol.replace('/', '').upper() for symbol in self.symbols]
        self.tick_aggregator = RealTimeTickAggregator(symbols=binance_symbols)

        # Raw tick and OHLCV bar callbacks
        self.raw_tick_callbacks = []
        self.ohlcv_bar_callbacks = []

        # Performance tracking for subscribers
        self.distribution_stats = {
            'total_ticks_received': 0,
            'total_ticks_distributed': 0,
            'distribution_errors': 0,
            'last_tick_time': {},
            'ticks_per_symbol': {symbol.replace('/', '').upper(): 0 for symbol in self.symbols},
            'raw_ticks_processed': 0,
            'ohlcv_bars_created': 0,
            'patterns_detected': 0
        }

        # Data validation
        self.last_prices = {symbol.replace('/', '').upper(): 0.0 for symbol in self.symbols}
        self.price_change_threshold = 0.1  # 10% price change threshold for validation

        # Timeframe conversion
        self.timeframe_seconds = {
            '1s': 1, '1m': 60, '5m': 300, '15m': 900, '30m': 1800,
            '1h': 3600, '4h': 14400, '1d': 86400
        }

        # Williams Market Structure integration
        self.williams_structure: Dict[str, WilliamsMarketStructure] = {}
        for symbol in self.symbols:
            self.williams_structure[symbol] = WilliamsMarketStructure(min_pivot_distance=3)

        # Pivot point caching
        self.pivot_points_cache: Dict[str, Dict[int, TrendLevel]] = {}  # {symbol: {level: TrendLevel}}
        self.last_pivot_calculation: Dict[str, datetime] = {}
        self.pivot_calculation_interval = timedelta(minutes=5)  # Recalculate every 5 minutes

        # Auto-fix corrupted cache files on startup
        self._auto_fix_corrupted_cache()

        # Load existing pivot bounds from cache
        self._load_all_pivot_bounds()

        # COB (Consolidated Order Book) data system using WebSocket
        self.cob_integration: Optional[COBIntegration] = None

        # COB data storage - 15 minutes of raw ticks and 1s aggregated data
        self.cob_raw_ticks: Dict[str, deque] = {}       # Raw COB ticks (15 min)
        self.cob_1s_aggregated: Dict[str, deque] = {}   # 1s aggregated COB data with $1 buckets

        # Initialize COB data structures
        for symbol in self.symbols:
            # Raw ticks: 15 minutes at ~100 ticks/second = ~90,000 ticks
            self.cob_raw_ticks[symbol] = deque(maxlen=90000)
            # 1s aggregated: 15 minutes = 900 seconds
            self.cob_1s_aggregated[symbol] = deque(maxlen=900)

        # COB callbacks for data distribution
        self.cob_data_callbacks: List[Callable] = []
        self.cob_aggregated_callbacks: List[Callable] = []

        # Training data collection (simplified)
        self.training_data_cache: Dict[str, deque] = {}
        self.training_data_callbacks: List[Callable] = []
        self.model_prediction_callbacks: List[Callable] = []

        # Initialize training data cache
        for symbol in self.symbols:
            binance_symbol = symbol.replace('/', '').upper()
            self.training_data_cache[binance_symbol] = deque(maxlen=1000)

        # Data collection threads
        self.data_collection_active = False

        # COB data collection
        self.cob_collection_active = False
        self.cob_collection_thread = None

        # Training data collection
        self.training_data_collection_active = False
        self.training_data_thread = None

        # Price-level bucketing
        self.bucketed_cob_data: Dict[str, Dict] = {}
        self.bucket_sizes = [1, 10]  # $1 and $10 buckets
        self.bucketed_cob_callbacks: Dict[int, List[Callable]] = {size: [] for size in self.bucket_sizes}

        # Automatic data maintenance
        self.data_maintenance_active = False
        self.data_maintenance_thread = None

        # Timeframe intervals in seconds for automatic updates
        self.timeframe_intervals = {
            '1s': 1,
            '1m': 60,
            '1h': 3600,
            '1d': 86400
        }

        logger.info(f"DataProvider initialized for symbols: {self.symbols}")
        logger.info(f"Timeframes: {self.timeframes}")
        logger.info("Automatic data maintenance enabled")
        logger.info("Centralized data distribution enabled")
        logger.info("Pivot-based normalization system enabled")
        logger.info("Williams Market Structure integration enabled")
        logger.info("COB and training data collection enabled")

        # Rate limiting
        self.last_request_time = {}
        self.request_interval = 0.5  # 500ms between requests to avoid rate limits
        self.retry_delay = 60        # 1 minute retry delay for 451 errors
        self.max_retries = 3

        # Start automatic data maintenance
        self.start_automatic_data_maintenance()

        # Start COB WebSocket integration
        self.start_cob_websocket_integration()

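    # Illustrative usage sketch (an assumption, not part of the original file):
    # the provider is typically constructed once and then queried for cached
    # candles by consumers such as models or the dashboard.
    #
    #     provider = DataProvider()
    #     eth_1m = provider.get_historical_data('ETH/USDT', '1m', limit=300)
    #     if eth_1m is not None:
    #         print(eth_1m[['open', 'high', 'low', 'close', 'volume']].tail())
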
    def start_automatic_data_maintenance(self):
        """Start automatic data maintenance system"""
        if self.data_maintenance_active:
            logger.warning("Data maintenance already active")
            return

        self.data_maintenance_active = True
        self.data_maintenance_thread = Thread(target=self._data_maintenance_worker, daemon=True)
        self.data_maintenance_thread.start()
        logger.info("Automatic data maintenance started")

    def stop_automatic_data_maintenance(self):
        """Stop automatic data maintenance system"""
        self.data_maintenance_active = False
        if self.data_maintenance_thread and self.data_maintenance_thread.is_alive():
            self.data_maintenance_thread.join(timeout=5)
        logger.info("Automatic data maintenance stopped")

    def _data_maintenance_worker(self):
        """Worker thread for automatic data maintenance"""
        logger.info("Data maintenance worker started")

        # Initial data load
        self._initial_data_load()

        # Track last update times for each symbol/timeframe
        last_updates = {}
        for symbol in self.symbols:
            last_updates[symbol] = {}
            for timeframe in self.timeframes:
                last_updates[symbol][timeframe] = 0

        while self.data_maintenance_active:
            try:
                current_time = time.time()

                # Check each symbol/timeframe for updates
                for symbol in self.symbols:
                    for timeframe in self.timeframes:
                        interval = self.timeframe_intervals[timeframe]
                        half_interval = interval / 2

                        # Update every half candle period
                        if current_time - last_updates[symbol][timeframe] >= half_interval:
                            self._update_cached_data(symbol, timeframe)
                            last_updates[symbol][timeframe] = current_time

                # Sleep for 1 second before next check
                time.sleep(1)

            except Exception as e:
                logger.error(f"Error in data maintenance worker: {e}")
                time.sleep(10)  # Wait longer on error

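    # Worked example of the refresh cadence above (illustrative):
    #     interval = self.timeframe_intervals['1m']   # 60 seconds
    #     half_interval = interval / 2                # 30 seconds
    # so 1m candles are re-fetched roughly twice per candle period, while '1h'
    # data refreshes every 1800 s and '1d' every 43200 s.
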
    def _initial_data_load(self):
        """Load initial 1500 candles for each symbol/timeframe"""
        logger.info("Starting initial data load (1500 candles each)")

        for symbol in self.symbols:
            for timeframe in self.timeframes:
                try:
                    logger.info(f"Loading initial data for {symbol} {timeframe}")
                    df = self._fetch_from_binance(symbol, timeframe, 1500)

                    if df is None or df.empty:
                        logger.warning(f"Binance failed for {symbol} {timeframe}, trying MEXC")
                        df = self._fetch_from_mexc(symbol, timeframe, 1500)

                    if df is not None and not df.empty:
                        # Ensure proper datetime index
                        df = self._ensure_datetime_index(df)

                        # Store in cached data
                        self.cached_data[symbol][timeframe] = df
                        logger.info(f"Loaded {len(df)} candles for {symbol} {timeframe}")
                    else:
                        logger.error(f"Failed to load initial data for {symbol} {timeframe}")

                    # Rate limiting between requests
                    time.sleep(0.5)

                except Exception as e:
                    logger.error(f"Error loading initial data for {symbol} {timeframe}: {e}")

        logger.info("Initial data load completed")

    def _update_cached_data(self, symbol: str, timeframe: str):
        """Update cached data by fetching the last 2 candles"""
        try:
            # Fetch last 2 candles
            df = self._fetch_from_binance(symbol, timeframe, 2)

            if df is None or df.empty:
                df = self._fetch_from_mexc(symbol, timeframe, 2)

            if df is not None and not df.empty:
                # Ensure proper datetime index
                df = self._ensure_datetime_index(df)

                # Get existing cached data
                existing_df = self.cached_data[symbol][timeframe]

                if not existing_df.empty:
                    # Merge new data with existing, avoiding duplicates
                    combined_df = pd.concat([existing_df, df], ignore_index=False)
                    combined_df = combined_df[~combined_df.index.duplicated(keep='last')]
                    combined_df = combined_df.sort_index()

                    # Keep only last 1500 candles
                    self.cached_data[symbol][timeframe] = combined_df.tail(1500)
                else:
                    self.cached_data[symbol][timeframe] = df

                logger.debug(f"Updated cached data for {symbol} {timeframe}: {len(self.cached_data[symbol][timeframe])} candles")

        except Exception as e:
            logger.debug(f"Error updating cached data for {symbol} {timeframe}: {e}")

    def start_cob_websocket_integration(self):
        """Start COB WebSocket integration using the COBIntegration class"""
        try:
            logger.info("Starting COB WebSocket integration")

            # Initialize COB integration
            self.cob_integration = COBIntegration(data_provider=self, symbols=self.symbols)

            # Add callback for COB data
            self.cob_integration.add_dashboard_callback(self._on_cob_websocket_update)

            # Start COB integration in background thread
            cob_thread = Thread(target=self._run_cob_integration, daemon=True)
            cob_thread.start()

            # Start 1s aggregation worker
            aggregation_thread = Thread(target=self._cob_aggregation_worker, daemon=True)
            aggregation_thread.start()

            logger.info("COB WebSocket integration started")

        except Exception as e:
            logger.error(f"Error starting COB WebSocket integration: {e}")

    def _run_cob_integration(self):
        """Run COB integration in an asyncio event loop"""
        try:
            # Create new event loop for this thread
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

            # Run COB integration
            loop.run_until_complete(self.cob_integration.start())

        except Exception as e:
            logger.error(f"Error running COB integration: {e}")

    def _on_cob_websocket_update(self, symbol: str, cob_data: Dict):
        """Handle COB updates from WebSocket"""
        try:
            # Extract the actual COB data from the wrapper
            if 'data' in cob_data:
                actual_data = cob_data['data']
            else:
                actual_data = cob_data

            # Create raw tick entry
            raw_tick = {
                'symbol': symbol,
                'timestamp': datetime.utcnow(),
                'bids': actual_data.get('bids', [])[:50],  # Top 50 levels
                'asks': actual_data.get('asks', [])[:50],  # Top 50 levels
                'stats': actual_data.get('stats', {}),
                'source': 'websocket'
            }

            # Store raw tick
            self.cob_raw_ticks[symbol].append(raw_tick)

            # Distribute to raw COB callbacks
            for callback in self.cob_data_callbacks:
                try:
                    callback(symbol, raw_tick)
                except Exception as e:
                    logger.error(f"Error in COB callback: {e}")

            logger.debug(f"Processed COB WebSocket update for {symbol}")

        except Exception as e:
            logger.error(f"Error processing COB WebSocket update for {symbol}: {e}")

    def _cob_aggregation_worker(self):
        """Worker thread for 1s COB aggregation with $1 price buckets"""
        logger.info("Starting COB 1s aggregation worker")

        # Track last aggregation time for each symbol
        last_aggregation = {symbol: 0 for symbol in self.symbols}

        while True:
            try:
                current_time = time.time()
                current_second = int(current_time)

                # Process each symbol
                for symbol in self.symbols:
                    # Aggregate every second
                    if current_second > last_aggregation[symbol]:
                        self._aggregate_cob_1s(symbol, current_second - 1)
                        last_aggregation[symbol] = current_second

                # Sleep until next second boundary
                sleep_time = 1.0 - (current_time % 1.0)
                time.sleep(max(0.1, sleep_time))

            except Exception as e:
                logger.error(f"Error in COB aggregation worker: {e}")
                time.sleep(1)

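    # Example of the boundary sleep above (illustrative): if time.time() returns
    # 1700000000.35, then
    #     sleep_time = 1.0 - (1700000000.35 % 1.0)   # = 0.65 s
    # so the worker wakes just after the next whole second; the max(0.1, ...)
    # floor keeps the loop from busy-spinning when it wakes very close to the
    # boundary.
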
    def _aggregate_cob_1s(self, symbol: str, target_second: int):
        """Aggregate COB data for 1 second with $1 price buckets and multi-timeframe imbalances"""
        try:
            # Get raw ticks for the target second
            target_ticks = []

            for tick in self.cob_raw_ticks[symbol]:
                tick_timestamp = tick['timestamp']

                # Handle both datetime and float timestamps
                if isinstance(tick_timestamp, datetime):
                    tick_time = tick_timestamp.timestamp()
                else:
                    tick_time = float(tick_timestamp)

                # Check if tick is in target second
                if target_second <= tick_time < target_second + 1:
                    target_ticks.append(tick)

            if not target_ticks:
                return

            # Aggregate the ticks with $1 price buckets
            aggregated_data = self._create_1s_cob_aggregation(symbol, target_ticks, target_second)

            # Add multi-timeframe imbalance calculations
            aggregated_data = self._add_multi_timeframe_imbalances(symbol, aggregated_data, target_second)

            # Store aggregated data
            self.cob_1s_aggregated[symbol].append(aggregated_data)

            # Distribute to aggregated COB callbacks
            for callback in self.cob_aggregated_callbacks:
                try:
                    callback(symbol, aggregated_data)
                except Exception as e:
                    logger.error(f"Error in COB aggregated callback: {e}")

            logger.debug(f"Aggregated {len(target_ticks)} COB ticks for {symbol} at second {target_second}")

        except Exception as e:
            logger.error(f"Error aggregating COB 1s for {symbol}: {e}")

    def _add_multi_timeframe_imbalances(self, symbol: str, aggregated_data: Dict, current_second: int) -> Dict:
        """Add COB-based order book imbalances with configurable price ranges"""
        try:
            # Get price range based on symbol
            price_range = self._get_price_range_for_symbol(symbol)

            # Get latest COB data for current imbalance calculation
            latest_cob = self.get_latest_cob_data(symbol)
            current_imbalance = 0.0

            if latest_cob:
                current_imbalance = self._calculate_cob_imbalance(latest_cob, price_range)

            # Get historical COB data for timeframe calculations
            historical_cob_data = list(self.cob_raw_ticks[symbol]) if symbol in self.cob_raw_ticks else []

            # Calculate imbalances for different timeframes using COB data
            imbalances = {
                'imbalance_1s': current_imbalance,  # Current COB imbalance
                'imbalance_5s': self._calculate_timeframe_cob_imbalance(historical_cob_data, 5, price_range),
                'imbalance_15s': self._calculate_timeframe_cob_imbalance(historical_cob_data, 15, price_range),
                'imbalance_60s': self._calculate_timeframe_cob_imbalance(historical_cob_data, 60, price_range)
            }

            # Add volume-weighted imbalances within price range
            volume_imbalances = {
                'volume_imbalance_1s': current_imbalance,
                'volume_imbalance_5s': self._calculate_volume_weighted_imbalance(historical_cob_data, 5, price_range),
                'volume_imbalance_15s': self._calculate_volume_weighted_imbalance(historical_cob_data, 15, price_range),
                'volume_imbalance_60s': self._calculate_volume_weighted_imbalance(historical_cob_data, 60, price_range)
            }

            # Combine all imbalance metrics
            all_imbalances = {**imbalances, **volume_imbalances}

            # Add to aggregated data
            aggregated_data.update(all_imbalances)

            # Also add to stats section for compatibility
            if 'stats' not in aggregated_data:
                aggregated_data['stats'] = {}
            aggregated_data['stats'].update(all_imbalances)

            # Add price range information for debugging
            aggregated_data['stats']['price_range_used'] = price_range

            logger.debug(f"COB imbalances for {symbol} (±${price_range}): {current_imbalance:.4f}")

            return aggregated_data

        except Exception as e:
            logger.error(f"Error calculating COB-based imbalances for {symbol}: {e}")
            # Return original data with default imbalances
            default_imbalances = {
                'imbalance_1s': 0.0, 'imbalance_5s': 0.0, 'imbalance_15s': 0.0, 'imbalance_60s': 0.0,
                'volume_imbalance_1s': 0.0, 'volume_imbalance_5s': 0.0, 'volume_imbalance_15s': 0.0, 'volume_imbalance_60s': 0.0
            }
            aggregated_data.update(default_imbalances)
            return aggregated_data

    def _get_price_range_for_symbol(self, symbol: str) -> float:
        """Get configurable price range for order book imbalance calculation"""
        # Configurable price ranges per symbol
        price_ranges = {
            'ETH/USDT': 5.0,   # $5 range for ETH
            'BTC/USDT': 50.0,  # $50 range for BTC
        }

        return price_ranges.get(symbol, 10.0)  # Default $10 range for other symbols

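    # Illustrative example (assumed prices): for 'ETH/USDT' the range is $5, so
    # with a mid price of 3500.25 the imbalance window used by the methods
    # below is
    #     price_min = 3495.25, price_max = 3505.25
    # i.e. only levels within +/- $5 of mid contribute to the imbalance metrics.
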
    def get_current_cob_imbalance(self, symbol: str) -> Dict[str, float]:
        """Get current COB imbalance metrics for a symbol"""
        # Resolve the price range first so the exception handler can always report it
        price_range = self._get_price_range_for_symbol(symbol)
        try:
            latest_cob = self.get_latest_cob_data(symbol)

            if not latest_cob:
                return {
                    'imbalance': 0.0,
                    'price_range': price_range,
                    'mid_price': 0.0,
                    'bid_volume_in_range': 0.0,
                    'ask_volume_in_range': 0.0
                }

            # Calculate detailed imbalance info
            bids = latest_cob.get('bids', [])
            asks = latest_cob.get('asks', [])

            if not bids or not asks:
                return {'imbalance': 0.0, 'price_range': price_range, 'mid_price': 0.0}

            # Calculate mid price with proper safety checks
            try:
                best_bid = float(bids[0][0])
                best_ask = float(asks[0][0])
                mid_price = (best_bid + best_ask) / 2.0
            except (IndexError, KeyError, ValueError) as e:
                logger.debug(f"Error calculating mid price for {symbol}: {e}")
                return {'imbalance': 0.0, 'price_range': price_range, 'mid_price': 0.0, 'error': str(e)}

            # Calculate volumes in range with safety checks
            price_min = mid_price - price_range
            price_max = mid_price + price_range

            bid_volume_in_range = 0.0
            ask_volume_in_range = 0.0

            try:
                for price, vol in bids:
                    price = float(price)
                    vol = float(vol)
                    if price_min <= price <= mid_price:
                        bid_volume_in_range += vol
            except (IndexError, KeyError, ValueError) as e:
                logger.debug(f"Error processing bid volumes for {symbol}: {e}")

            try:
                for price, vol in asks:
                    price = float(price)
                    vol = float(vol)
                    if mid_price <= price <= price_max:
                        ask_volume_in_range += vol
            except (IndexError, KeyError, ValueError) as e:
                logger.debug(f"Error processing ask volumes for {symbol}: {e}")

            total_volume = bid_volume_in_range + ask_volume_in_range
            imbalance = (bid_volume_in_range - ask_volume_in_range) / total_volume if total_volume > 0 else 0.0

            return {
                'imbalance': imbalance,
                'price_range': price_range,
                'mid_price': mid_price,
                'bid_volume_in_range': bid_volume_in_range,
                'ask_volume_in_range': ask_volume_in_range,
                'total_volume_in_range': total_volume,
                'best_bid': best_bid,
                'best_ask': best_ask
            }

        except Exception as e:
            logger.error(f"Error getting current COB imbalance for {symbol}: {e}")
            return {'imbalance': 0.0, 'price_range': price_range, 'error': str(e)}

    def _calculate_cob_imbalance(self, cob_data: Dict, price_range: float) -> float:
        """Calculate order book imbalance within specified price range around mid price"""
        try:
            bids = cob_data.get('bids', [])
            asks = cob_data.get('asks', [])

            if not bids or not asks:
                return 0.0

            # Calculate mid price with proper safety checks
            try:
                best_bid = float(bids[0][0])
                best_ask = float(asks[0][0])

                if best_bid <= 0 or best_ask <= 0:
                    return 0.0

                mid_price = (best_bid + best_ask) / 2.0
            except (IndexError, KeyError, ValueError) as e:
                logger.debug(f"Error calculating mid price: {e}")
                return 0.0

            # Define price range around mid price
            price_min = mid_price - price_range
            price_max = mid_price + price_range

            # Sum volumes within price range
            bid_volume_in_range = 0.0
            ask_volume_in_range = 0.0

            # Sum bid volumes within range with safety checks
            try:
                for bid_price, bid_volume in bids:
                    bid_price = float(bid_price)
                    bid_volume = float(bid_volume)
                    if price_min <= bid_price <= mid_price:
                        bid_volume_in_range += bid_volume
            except (IndexError, KeyError, ValueError) as e:
                logger.debug(f"Error processing bid volumes: {e}")

            # Sum ask volumes within range with safety checks
            try:
                for ask_price, ask_volume in asks:
                    ask_price = float(ask_price)
                    ask_volume = float(ask_volume)
                    if mid_price <= ask_price <= price_max:
                        ask_volume_in_range += ask_volume
            except (IndexError, KeyError, ValueError) as e:
                logger.debug(f"Error processing ask volumes: {e}")

            # Calculate imbalance: (bid_volume - ask_volume) / (bid_volume + ask_volume)
            total_volume = bid_volume_in_range + ask_volume_in_range

            if total_volume > 0:
                imbalance = (bid_volume_in_range - ask_volume_in_range) / total_volume
                return imbalance
            else:
                return 0.0

        except Exception as e:
            logger.error(f"Error calculating COB imbalance: {e}")
            return 0.0

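    # Worked example of the imbalance formula above (hypothetical numbers):
    #     bid_volume_in_range = 120.0, ask_volume_in_range = 80.0
    #     imbalance = (120.0 - 80.0) / (120.0 + 80.0) = 0.2
    # The result is bounded in [-1, 1]: +1 means only bid volume inside the
    # window, -1 means only ask volume, 0 means a balanced book.
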
    def _calculate_timeframe_cob_imbalance(self, historical_cob_data: List[Dict], seconds: int, price_range: float) -> float:
        """Calculate average COB imbalance over the specified timeframe"""
        try:
            if not historical_cob_data or len(historical_cob_data) == 0:
                return 0.0

            # Get recent data within timeframe (approximate by using last N ticks)
            # Assuming ~100 ticks per second, so N = seconds * 100
            max_ticks = seconds * 100
            recent_ticks = historical_cob_data[-max_ticks:] if len(historical_cob_data) > max_ticks else historical_cob_data

            if not recent_ticks:
                return 0.0

            # Calculate imbalance for each tick and average
            imbalances = []
            for tick in recent_ticks:
                imbalance = self._calculate_cob_imbalance(tick, price_range)
                imbalances.append(imbalance)

            if imbalances:
                return sum(imbalances) / len(imbalances)
            else:
                return 0.0

        except Exception as e:
            logger.error(f"Error calculating {seconds}s COB imbalance: {e}")
            return 0.0

    def _calculate_volume_weighted_imbalance(self, historical_cob_data: List[Dict], seconds: int, price_range: float) -> float:
        """Calculate volume-weighted average imbalance over timeframe"""
        try:
            if not historical_cob_data:
                return 0.0

            # Get recent data within timeframe
            max_ticks = seconds * 100  # Approximate ticks per second
            recent_ticks = historical_cob_data[-max_ticks:] if len(historical_cob_data) > max_ticks else historical_cob_data

            if not recent_ticks:
                return 0.0

            total_weighted_imbalance = 0.0
            total_volume = 0.0

            for tick in recent_ticks:
                imbalance = self._calculate_cob_imbalance(tick, price_range)

                # Calculate total volume in range for weighting
                bids = tick.get('bids', [])
                asks = tick.get('asks', [])

                if bids and asks and len(bids) > 0 and len(asks) > 0:
                    # Get mid price for this tick with proper safety checks
                    try:
                        best_bid = float(bids[0][0])
                        best_ask = float(asks[0][0])
                        mid_price = (best_bid + best_ask) / 2.0
                    except (IndexError, KeyError, ValueError) as e:
                        logger.debug(f"Skipping tick due to data format issue: {e}")
                        continue

                    # Calculate volume in range
                    price_min = mid_price - price_range
                    price_max = mid_price + price_range

                    tick_volume = 0.0
                    try:
                        for bid_price, bid_volume in bids:
                            bid_price = float(bid_price)
                            bid_volume = float(bid_volume)
                            if price_min <= bid_price <= mid_price:
                                tick_volume += bid_volume
                    except (IndexError, KeyError, ValueError) as e:
                        logger.debug(f"Error processing bid volumes in weighted calculation: {e}")

                    try:
                        for ask_price, ask_volume in asks:
                            ask_price = float(ask_price)
                            ask_volume = float(ask_volume)
                            if mid_price <= ask_price <= price_max:
                                tick_volume += ask_volume
                    except (IndexError, KeyError, ValueError) as e:
                        logger.debug(f"Error processing ask volumes in weighted calculation: {e}")

                    if tick_volume > 0:
                        total_weighted_imbalance += imbalance * tick_volume
                        total_volume += tick_volume

            if total_volume > 0:
                return total_weighted_imbalance / total_volume
            else:
                return 0.0

        except Exception as e:
            logger.error(f"Error calculating volume-weighted {seconds}s imbalance: {e}")
            return 0.0

    def _create_1s_cob_aggregation(self, symbol: str, ticks: List[Dict], timestamp: int) -> Dict:
        """Create 1s aggregation with $1 price buckets"""
        try:
            if not ticks:
                return {}

            # Initialize buckets
            bid_buckets = {}  # {price_bucket: total_volume}
            ask_buckets = {}  # {price_bucket: total_volume}

            # Statistics tracking
            all_mid_prices = []
            all_spreads = []
            all_imbalances = []
            total_bid_volume = 0
            total_ask_volume = 0

            # Process each tick
            for tick in ticks:
                stats = tick.get('stats', {})
                bids = tick.get('bids', [])
                asks = tick.get('asks', [])

                # Track statistics
                mid_price = stats.get('mid_price', 0)
                if mid_price > 0:
                    all_mid_prices.append(mid_price)

                spread = stats.get('spread_bps', 0)
                if spread > 0:
                    all_spreads.append(spread)

                imbalance = stats.get('imbalance', 0)
                all_imbalances.append(imbalance)

                # Process bids with $1 buckets
                for bid in bids:
                    if isinstance(bid, dict):
                        price = bid.get('price', 0)
                        volume = bid.get('volume', 0)
                    elif isinstance(bid, list) and len(bid) >= 2:
                        price = float(bid[0])
                        volume = float(bid[1])
                    else:
                        continue

                    if price > 0 and volume > 0:
                        # Create $1 bucket (floor to nearest dollar)
                        bucket = math.floor(price)
                        if bucket not in bid_buckets:
                            bid_buckets[bucket] = 0
                        bid_buckets[bucket] += volume
                        total_bid_volume += volume

                # Process asks with $1 buckets
                for ask in asks:
                    if isinstance(ask, dict):
                        price = ask.get('price', 0)
                        volume = ask.get('volume', 0)
                    elif isinstance(ask, list) and len(ask) >= 2:
                        price = float(ask[0])
                        volume = float(ask[1])
                    else:
                        continue

                    if price > 0 and volume > 0:
                        # Create $1 bucket (floor to nearest dollar)
                        bucket = math.floor(price)
                        if bucket not in ask_buckets:
                            ask_buckets[bucket] = 0
                        ask_buckets[bucket] += volume
                        total_ask_volume += volume

            # Calculate aggregated statistics
            avg_mid_price = sum(all_mid_prices) / len(all_mid_prices) if all_mid_prices else 0
            avg_spread = sum(all_spreads) / len(all_spreads) if all_spreads else 0
            avg_imbalance = sum(all_imbalances) / len(all_imbalances) if all_imbalances else 0

            # Calculate current imbalance from total volumes
            total_volume = total_bid_volume + total_ask_volume
            current_imbalance = (total_bid_volume - total_ask_volume) / total_volume if total_volume > 0 else 0

            # Create aggregated data structure
            aggregated = {
                'symbol': symbol,
                'timestamp': timestamp,
                'tick_count': len(ticks),
                'bucket_size_usd': 1.0,  # $1 buckets
                'bid_buckets': dict(sorted(bid_buckets.items(), reverse=True)[:50]),  # Top 50 bid buckets
                'ask_buckets': dict(sorted(ask_buckets.items())[:50]),                # Top 50 ask buckets
                'imbalance': current_imbalance,  # Current 1s imbalance
                'total_volume': total_volume,
                'stats': {
                    'avg_mid_price': avg_mid_price,
                    'avg_spread_bps': avg_spread,
                    'avg_imbalance': avg_imbalance,
                    'current_imbalance': current_imbalance,
                    'total_bid_volume': total_bid_volume,
                    'total_ask_volume': total_ask_volume,
                    'total_volume': total_volume,
                    'bid_bucket_count': len(bid_buckets),
                    'ask_bucket_count': len(ask_buckets),
                    'price_range_usd': max(max(bid_buckets.keys()) if bid_buckets else 0,
                                           max(ask_buckets.keys()) if ask_buckets else 0) -
                                       min(min(bid_buckets.keys()) if bid_buckets else 0,
                                           min(ask_buckets.keys()) if ask_buckets else 0)
                }
            }

            return aggregated

        except Exception as e:
            logger.error(f"Error creating 1s COB aggregation: {e}")
            return {}

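    # Worked example of the $1 bucketing above (hypothetical levels): bids at
    # 3421.75 (2.0) and 3421.10 (1.5) both floor to bucket 3421, so
    #     bid_buckets[3421] = 3.5
    # while an ask at 3422.05 (0.8) lands in ask_buckets[3422]. The aggregate
    # therefore stores per-dollar liquidity rather than individual levels.
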
    def _ensure_datetime_index(self, df: pd.DataFrame) -> pd.DataFrame:
        """Ensure dataframe has a proper datetime index"""
        if df is None or df.empty:
            return df

        try:
            # If we already have a proper DatetimeIndex, return as is
            if isinstance(df.index, pd.DatetimeIndex):
                return df

            # If timestamp column exists, use it as index
            if 'timestamp' in df.columns:
                df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
                df.set_index('timestamp', inplace=True)
                return df

            # If we have a RangeIndex or other non-datetime index, create datetime index
            if isinstance(df.index, pd.RangeIndex) or not isinstance(df.index, pd.DatetimeIndex):
                # Use current UTC time and work backwards for realistic timestamps
                from datetime import datetime, timedelta
                end_time = datetime.utcnow()
                start_time = end_time - timedelta(minutes=len(df))
                df.index = pd.date_range(start=start_time, end=end_time, periods=len(df), tz='UTC')
                logger.debug(f"Converted RangeIndex to DatetimeIndex for {len(df)} records")

            return df

        except Exception as e:
            logger.warning(f"Error ensuring datetime index: {e}")
            return df

    def get_historical_data(self, symbol: str, timeframe: str, limit: int = 1000, refresh: bool = False) -> Optional[pd.DataFrame]:
        """Get historical OHLCV data from cache only - no external API calls"""
        try:
            # Only return cached data - never trigger external API calls
            if symbol in self.cached_data and timeframe in self.cached_data[symbol]:
                cached_df = self.cached_data[symbol][timeframe]
                if not cached_df.empty:
                    # Return requested amount from cached data
                    return cached_df.tail(limit)

            logger.warning(f"No cached data available for {symbol} {timeframe}")
            return None

        except Exception as e:
            logger.error(f"Error getting cached data for {symbol} {timeframe}: {e}")
            return None

    def _fetch_from_mexc(self, symbol: str, timeframe: str, limit: int) -> Optional[pd.DataFrame]:
        """Fetch data from MEXC API (fallback data source when Binance is unavailable)"""
        try:
            # MEXC doesn't support 1s intervals
            if timeframe == '1s':
                logger.warning(f"MEXC doesn't support 1s intervals, skipping {symbol}")
                return None

            # Convert symbol format
            mexc_symbol = symbol.replace('/', '').upper()

            # Convert timeframe for MEXC (excluding 1s)
            timeframe_map = {
                '1m': '1m', '5m': '5m', '15m': '15m', '30m': '30m',
                '1h': '1h', '4h': '4h', '1d': '1d'
            }
            mexc_timeframe = timeframe_map.get(timeframe)

            if mexc_timeframe is None:
                logger.warning(f"MEXC doesn't support timeframe {timeframe}, skipping {symbol}")
                return None

            # MEXC API request
            url = "https://api.mexc.com/api/v3/klines"
            params = {
                'symbol': mexc_symbol,
                'interval': mexc_timeframe,
                'limit': limit
            }

            response = requests.get(url, params=params)
            response.raise_for_status()

            data = response.json()

            # Convert to DataFrame (MEXC uses 8 columns vs Binance's 12)
            df = pd.DataFrame(data, columns=[
                'timestamp', 'open', 'high', 'low', 'close', 'volume',
                'close_time', 'quote_volume'
            ])

            # Process columns with proper timezone handling (MEXC returns UTC timestamps)
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)
            for col in ['open', 'high', 'low', 'close', 'volume']:
                df[col] = df[col].astype(float)

            # Keep only OHLCV columns
            df = df[['timestamp', 'open', 'high', 'low', 'close', 'volume']]
            df = df.sort_values('timestamp').reset_index(drop=True)

            logger.info(f"MEXC: Fetched {len(df)} candles for {symbol} {timeframe}")
            return df

        except Exception as e:
            logger.error(f"MEXC: Error fetching data: {e}")
            return None

    def _fetch_from_binance(self, symbol: str, timeframe: str, limit: int) -> Optional[pd.DataFrame]:
        """Fetch data from Binance API with robust rate limiting and error handling"""
        try:
            from .api_rate_limiter import get_rate_limiter

            # Convert symbol format
            binance_symbol = symbol.replace('/', '').upper()

            # Convert timeframe (now includes 1s support)
            timeframe_map = {
                '1s': '1s', '1m': '1m', '5m': '5m', '15m': '15m', '30m': '30m',
                '1h': '1h', '4h': '4h', '1d': '1d'
            }
            binance_timeframe = timeframe_map.get(timeframe, '1h')

            # Use rate limiter for API requests
            rate_limiter = get_rate_limiter()

            # Check if we can make a request
            can_request, wait_time = rate_limiter.can_make_request('binance_api')
            if not can_request:
                logger.debug(f"Binance rate limited, waiting {wait_time:.1f}s for {symbol} {timeframe}")
                if wait_time > 30:  # If wait is too long, use fallback
                    return self._get_fallback_data(symbol, timeframe, limit)
                time.sleep(min(wait_time, 5))  # Cap wait at 5 seconds

            # API request with rate limiter
            url = "https://api.binance.com/api/v3/klines"
            params = {
                'symbol': binance_symbol,
                'interval': binance_timeframe,
                'limit': limit
            }

            response = rate_limiter.make_request('binance_api', url, 'GET', params=params)

            if response is None:
                logger.warning(f"Binance API request failed for {symbol} {timeframe} - using fallback")
                return self._get_fallback_data(symbol, timeframe, limit)

            if response.status_code != 200:
                logger.warning(f"Binance API returned {response.status_code} for {symbol} {timeframe}")
                return self._get_fallback_data(symbol, timeframe, limit)

            data = response.json()

            # Convert to DataFrame
            df = pd.DataFrame(data, columns=[
                'timestamp', 'open', 'high', 'low', 'close', 'volume',
                'close_time', 'quote_volume', 'trades', 'taker_buy_base',
                'taker_buy_quote', 'ignore'
            ])

            # Process columns with proper timezone handling (Binance returns UTC timestamps)
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)
            for col in ['open', 'high', 'low', 'close', 'volume']:
                df[col] = df[col].astype(float)

            # Keep only OHLCV columns
            df = df[['timestamp', 'open', 'high', 'low', 'close', 'volume']]
            df = df.sort_values('timestamp').reset_index(drop=True)

            logger.info(f"Binance: Fetched {len(df)} candles for {symbol} {timeframe}")
            return df

        except Exception as e:
            if "451" in str(e) or "Client Error" in str(e):
                logger.warning(f"Binance API access blocked (451) for {symbol} {timeframe} - using fallback")
                return self._get_fallback_data(symbol, timeframe, limit)
            else:
                logger.error(f"Error fetching from Binance API: {e}")
                return self._get_fallback_data(symbol, timeframe, limit)

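    # For reference (hedged; based on the public Binance REST klines docs, not
    # on this repository): each kline row parsed above is a 12-element array,
    # e.g.
    #     [1700000000000, "3500.10", "3502.00", "3499.50", "3501.20", "123.4",
    #      1700000059999, "432109.8", 567, "61.2", "214300.5", "0"]
    # which maps onto the column list used when building the DataFrame
    # (timestamp, open, high, low, close, volume, close_time, quote_volume,
    # trades, taker_buy_base, taker_buy_quote, ignore). Values are illustrative.
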
    def _get_fallback_data(self, symbol: str, timeframe: str, limit: int) -> Optional[pd.DataFrame]:
        """Get fallback data when Binance API is unavailable - REAL DATA ONLY"""
        try:
            logger.info(f"FALLBACK: Attempting to get real cached data for {symbol} {timeframe}")

            # ONLY try cached data
            cached_data = self._load_from_cache(symbol, timeframe)
            if cached_data is not None and not cached_data.empty:
                # Limit to requested amount
                limited_data = cached_data.tail(limit) if len(cached_data) > limit else cached_data
                logger.info(f"FALLBACK: Using cached real data for {symbol} {timeframe}: {len(limited_data)} bars")
                return limited_data

            # Try MEXC as secondary real data source
            mexc_data = self._fetch_from_mexc(symbol, timeframe, limit)
            if mexc_data is not None and not mexc_data.empty:
                logger.info(f"FALLBACK: Using MEXC real data for {symbol} {timeframe}: {len(mexc_data)} bars")
                return mexc_data

            # NO SYNTHETIC DATA - Return None if no real data available
            logger.warning(f"FALLBACK: No real data available for {symbol} {timeframe} - waiting for real data")
            return None

        except Exception as e:
            logger.error(f"Error getting fallback data: {e}")
            return None

    def _add_technical_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add comprehensive technical indicators AND pivot-based normalization context"""
        try:
            df = df.copy()

            # Ensure we have enough data for indicators
            if len(df) < 50:
                logger.warning(f"Insufficient data for comprehensive indicators: {len(df)} rows")
                return self._add_basic_indicators(df)

            # === EXISTING TECHNICAL INDICATORS ===
            # Moving averages (multiple timeframes)
            df['sma_10'] = ta.trend.sma_indicator(df['close'], window=10)
            df['sma_20'] = ta.trend.sma_indicator(df['close'], window=20)
            df['sma_50'] = ta.trend.sma_indicator(df['close'], window=50)
            df['ema_12'] = ta.trend.ema_indicator(df['close'], window=12)
            df['ema_26'] = ta.trend.ema_indicator(df['close'], window=26)
            df['ema_50'] = ta.trend.ema_indicator(df['close'], window=50)

            # MACD family
            macd = ta.trend.MACD(df['close'])
            df['macd'] = macd.macd()
            df['macd_signal'] = macd.macd_signal()
            df['macd_histogram'] = macd.macd_diff()

            # ADX (Average Directional Index)
            adx = ta.trend.ADXIndicator(df['high'], df['low'], df['close'])
            df['adx'] = adx.adx()
            df['adx_pos'] = adx.adx_pos()
            df['adx_neg'] = adx.adx_neg()

            # Parabolic SAR
            psar = ta.trend.PSARIndicator(df['high'], df['low'], df['close'])
            df['psar'] = psar.psar()

            # === MOMENTUM INDICATORS ===
            # RSI (multiple periods)
            df['rsi_14'] = ta.momentum.rsi(df['close'], window=14)
            df['rsi_7'] = ta.momentum.rsi(df['close'], window=7)
            df['rsi_21'] = ta.momentum.rsi(df['close'], window=21)

            # Stochastic Oscillator
            stoch = ta.momentum.StochasticOscillator(df['high'], df['low'], df['close'])
            df['stoch_k'] = stoch.stoch()
            df['stoch_d'] = stoch.stoch_signal()

            # Williams %R
            df['williams_r'] = ta.momentum.williams_r(df['high'], df['low'], df['close'])

            # Ultimate Oscillator (instead of CCI, which isn't available)
            df['ultimate_osc'] = ta.momentum.ultimate_oscillator(df['high'], df['low'], df['close'])

            # === VOLATILITY INDICATORS ===
            # Bollinger Bands
            bollinger = ta.volatility.BollingerBands(df['close'])
            df['bb_upper'] = bollinger.bollinger_hband()
            df['bb_lower'] = bollinger.bollinger_lband()
            df['bb_middle'] = bollinger.bollinger_mavg()
            df['bb_width'] = (df['bb_upper'] - df['bb_lower']) / df['bb_middle']
            df['bb_percent'] = (df['close'] - df['bb_lower']) / (df['bb_upper'] - df['bb_lower'])

            # Average True Range
            df['atr'] = ta.volatility.average_true_range(df['high'], df['low'], df['close'])

            # Keltner Channels
            keltner = ta.volatility.KeltnerChannel(df['high'], df['low'], df['close'])
            df['keltner_upper'] = keltner.keltner_channel_hband()
            df['keltner_lower'] = keltner.keltner_channel_lband()
            df['keltner_middle'] = keltner.keltner_channel_mband()

            # === VOLUME INDICATORS ===
            # Volume moving averages
            df['volume_sma_10'] = df['volume'].rolling(window=10).mean()
            df['volume_sma_20'] = df['volume'].rolling(window=20).mean()
            df['volume_sma_50'] = df['volume'].rolling(window=50).mean()

            # On Balance Volume
            df['obv'] = ta.volume.on_balance_volume(df['close'], df['volume'])

            # Volume Price Trend
            df['vpt'] = ta.volume.volume_price_trend(df['close'], df['volume'])

            # Money Flow Index
            df['mfi'] = ta.volume.money_flow_index(df['high'], df['low'], df['close'], df['volume'])

            # Accumulation/Distribution Line
            df['ad_line'] = ta.volume.acc_dist_index(df['high'], df['low'], df['close'], df['volume'])

            # Volume Weighted Average Price (VWAP)
            df['vwap'] = (df['close'] * df['volume']).cumsum() / df['volume'].cumsum()

            # === PRICE ACTION INDICATORS ===
            # Price position relative to range
            df['price_position'] = (df['close'] - df['low']) / (df['high'] - df['low'])

            # True Range (use ATR calculation for true range)
            df['true_range'] = df['atr']  # ATR is based on true range, so use it directly

            # Rate of Change
            df['roc'] = ta.momentum.roc(df['close'], window=10)

            # === CUSTOM INDICATORS ===
            # Trend strength (combination of multiple trend indicators)
            df['trend_strength'] = (
                (df['close'] > df['sma_20']).astype(int) +
                (df['sma_10'] > df['sma_20']).astype(int) +
                (df['macd'] > df['macd_signal']).astype(int) +
                (df['adx'] > 25).astype(int)
            ) / 4.0

            # Momentum composite
            df['momentum_composite'] = (
                (df['rsi_14'] / 100) +
                ((df['stoch_k'] + 50) / 100) +   # Normalize stoch_k
                ((df['williams_r'] + 50) / 100)  # Normalize williams_r
            ) / 3.0

            # Volatility regime
            df['volatility_regime'] = (df['atr'] / df['close']).rolling(window=20).rank(pct=True)

            # === WILLIAMS MARKET STRUCTURE PIVOT CONTEXT ===
            # Check if we need to refresh pivot bounds for this symbol
            symbol = self._extract_symbol_from_dataframe(df)
            if symbol and self._should_refresh_pivot_bounds(symbol):
                logger.info(f"Refreshing pivot bounds for {symbol}")
                self._refresh_pivot_bounds_for_symbol(symbol)

            # Add pivot-based context features
            if symbol and symbol in self.pivot_bounds:
                df = self._add_pivot_context_features(df, symbol)

            # === FILL NaN VALUES ===
            # Forward fill first, then backward fill, then zero fill
            df = df.ffill().bfill().fillna(0)

            logger.debug(f"Added technical indicators + pivot context for {len(df)} rows")
            return df

        except Exception as e:
            logger.error(f"Error adding comprehensive technical indicators: {e}")
            # Fallback to basic indicators
            return self._add_basic_indicators(df)

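    # Worked example of the trend_strength composite above (hypothetical bar):
    # close > sma_20 (1) + sma_10 > sma_20 (1) + macd > macd_signal (0) +
    # adx > 25 (1) gives 3 / 4.0 = 0.75, i.e. three of the four trend
    # conditions agree on an uptrend for that row.
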
    # === WILLIAMS MARKET STRUCTURE PIVOT SYSTEM ===

    def _collect_monthly_1m_data(self, symbol: str) -> Optional[pd.DataFrame]:
        """Collect 30 days of 1m candles with a smart gap-filling cache system"""
        try:
            # Check for cached data and determine what we need to fetch
            cached_data = self._load_monthly_data_from_cache(symbol)

            end_time = datetime.utcnow()
            start_time = end_time - timedelta(days=30)

            if cached_data is not None and not cached_data.empty:
                logger.info(f"Found cached monthly 1m data for {symbol}: {len(cached_data)} candles")

                # Check cache data range
                cache_start = cached_data['timestamp'].min()
                cache_end = cached_data['timestamp'].max()

                logger.info(f"Cache range: {cache_start} to {cache_end}")

                # Remove data older than 30 days
                cached_data = cached_data[cached_data['timestamp'] >= start_time]

                # Check if we need to fill gaps
                gap_start = cache_end + timedelta(minutes=1)

                if gap_start < end_time:
                    # Need to fill gap from cache_end to now
                    logger.info(f"Filling gap from {gap_start} to {end_time}")
                    gap_data = self._fetch_1m_data_range(symbol, gap_start, end_time)

                    if gap_data is not None and not gap_data.empty:
                        # Combine cached data with gap data
                        monthly_df = pd.concat([cached_data, gap_data], ignore_index=True)
                        monthly_df = monthly_df.sort_values('timestamp').drop_duplicates(subset=['timestamp']).reset_index(drop=True)
                        logger.info(f"Combined cache + gap: {len(monthly_df)} total candles")
                    else:
                        monthly_df = cached_data
                        logger.info(f"Using cached data only: {len(monthly_df)} candles")
                else:
                    monthly_df = cached_data
                    logger.info(f"Cache is up to date: {len(monthly_df)} candles")
            else:
                # No cache - fetch full 30 days
                logger.info(f"No cache found, collecting full 30 days of 1m data for {symbol}")
                monthly_df = self._fetch_1m_data_range(symbol, start_time, end_time)

            if monthly_df is not None and not monthly_df.empty:
                # Final cleanup: ensure exactly 30 days
                monthly_df = monthly_df[monthly_df['timestamp'] >= start_time]
                monthly_df = monthly_df.sort_values('timestamp').reset_index(drop=True)

                logger.info(f"Final dataset: {len(monthly_df)} 1m candles for {symbol}")

                # Update cache
                self._save_monthly_data_to_cache(symbol, monthly_df)

                return monthly_df
            else:
                logger.error(f"No monthly 1m data collected for {symbol}")
                return None

        except Exception as e:
            logger.error(f"Error collecting monthly 1m data for {symbol}: {e}")
            return None

    def _fetch_1s_batch_with_endtime(self, symbol: str, end_time: datetime, limit: int = 1000) -> Optional[pd.DataFrame]:
        """Fetch a batch of 1s candles ending at a specific time"""
        try:
            binance_symbol = symbol.replace('/', '').upper()

            # Convert end_time to milliseconds
            end_ms = int(end_time.timestamp() * 1000)

            # API request
            url = "https://api.binance.com/api/v3/klines"
            params = {
                'symbol': binance_symbol,
                'interval': '1s',
                'endTime': end_ms,
                'limit': limit
            }

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'application/json'
            }

            response = requests.get(url, params=params, headers=headers, timeout=10)
            response.raise_for_status()

            data = response.json()

            if not data:
                return None

            # Convert to DataFrame
            df = pd.DataFrame(data, columns=[
                'timestamp', 'open', 'high', 'low', 'close', 'volume',
                'close_time', 'quote_volume', 'trades', 'taker_buy_base',
                'taker_buy_quote', 'ignore'
            ])

            # Process columns
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
            for col in ['open', 'high', 'low', 'close', 'volume']:
                df[col] = df[col].astype(float)

            # Keep only OHLCV columns
            df = df[['timestamp', 'open', 'high', 'low', 'close', 'volume']]

            return df

        except Exception as e:
            logger.error(f"Error fetching 1s batch for {symbol}: {e}")
            return None

    def _fetch_1m_data_range(self, symbol: str, start_time: datetime, end_time: datetime) -> Optional[pd.DataFrame]:
        """Fetch 1m candles for a specific time range with efficient batching"""
        try:
            # Convert symbol format for Binance API
            if '/' in symbol:
                api_symbol = symbol.replace('/', '')
            else:
                api_symbol = symbol

            logger.info(f"Fetching 1m data for {symbol} from {start_time} to {end_time}")

            all_candles = []
            current_start = start_time
            batch_size = 1000  # Binance limit
            api_calls_made = 0

            while current_start < end_time and api_calls_made < 50:  # Safety limit for 30 days
                try:
                    # Calculate end time for this batch
                    batch_end = min(current_start + timedelta(minutes=batch_size), end_time)

                    # Convert to milliseconds
                    start_timestamp = int(current_start.timestamp() * 1000)
                    end_timestamp = int(batch_end.timestamp() * 1000)

                    # Binance API call
                    url = "https://api.binance.com/api/v3/klines"
                    params = {
                        'symbol': api_symbol,
                        'interval': '1m',
                        'startTime': start_timestamp,
                        'endTime': end_timestamp,
                        'limit': batch_size
                    }

                    headers = {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                        'Accept': 'application/json'
                    }

                    response = requests.get(url, params=params, headers=headers, timeout=10)
                    response.raise_for_status()

                    data = response.json()
                    api_calls_made += 1

                    if not data:
                        logger.warning(f"No data returned for batch {current_start} to {batch_end}")
                        break

                    # Convert to DataFrame
                    batch_df = pd.DataFrame(data, columns=[
                        'timestamp', 'open', 'high', 'low', 'close', 'volume',
                        'close_time', 'quote_volume', 'trades', 'taker_buy_base',
                        'taker_buy_quote', 'ignore'
                    ])

                    # Process columns
                    batch_df['timestamp'] = pd.to_datetime(batch_df['timestamp'], unit='ms')
                    for col in ['open', 'high', 'low', 'close', 'volume']:
                        batch_df[col] = batch_df[col].astype(float)

                    # Keep only OHLCV columns
                    batch_df = batch_df[['timestamp', 'open', 'high', 'low', 'close', 'volume']]

                    all_candles.append(batch_df)

                    # Move to next batch (add 1 minute to avoid overlap)
                    current_start = batch_end + timedelta(minutes=1)

                    # Rate limiting (Binance allows 1200/min)
                    time.sleep(0.05)  # 50ms delay

                    # Progress logging
                    if api_calls_made % 10 == 0:
                        total_candles = sum(len(df) for df in all_candles)
                        logger.info(f"Progress: {api_calls_made} API calls, {total_candles} candles collected")

                except Exception as e:
                    logger.error(f"Error in batch {current_start} to {batch_end}: {e}")
                    current_start += timedelta(minutes=batch_size)
                    time.sleep(1)  # Wait longer on error
                    continue

            if not all_candles:
                logger.error(f"No data collected for {symbol}")
                return None

            # Combine all batches
            df = pd.concat(all_candles, ignore_index=True)
            df = df.sort_values('timestamp').drop_duplicates(subset=['timestamp']).reset_index(drop=True)

            logger.info(f"Successfully fetched {len(df)} 1m candles for {symbol} ({api_calls_made} API calls)")
            return df

        except Exception as e:
            logger.error(f"Error fetching 1m data range for {symbol}: {e}")
            return None

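    # Batching arithmetic for the loop above (illustrative): 30 days of 1m
    # candles is 30 * 24 * 60 = 43,200 rows; at 1000 candles per request that
    # is roughly 44 API calls, comfortably under the 50-call safety limit, and
    # the 50 ms delay adds only a few seconds of throttling overall.
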
    def _extract_pivot_bounds_from_monthly_data(self, symbol: str, monthly_data: pd.DataFrame) -> Optional[PivotBounds]:
        """Extract pivot bounds using Williams Market Structure analysis"""
        try:
            logger.info(f"Analyzing {len(monthly_data)} candles for pivot extraction...")

            # Convert DataFrame to numpy array format expected by Williams Market Structure
            ohlcv_array = monthly_data[['timestamp', 'open', 'high', 'low', 'close', 'volume']].copy()

            # Convert timestamp to numeric for Williams analysis
            ohlcv_array['timestamp'] = ohlcv_array['timestamp'].astype(np.int64) // 10**9  # Convert to seconds
            ohlcv_array = ohlcv_array.to_numpy()

            # Initialize Williams Market Structure analyzer
            try:
                from training.williams_market_structure import WilliamsMarketStructure

                williams = WilliamsMarketStructure(
                    swing_strengths=[2, 3, 5, 8],  # Multi-strength pivot detection
                    enable_cnn_feature=False       # We just want pivot data, not CNN training
                )

                # Calculate 5 levels of recursive pivot points
                logger.info("Running Williams Market Structure analysis...")
                pivot_levels = williams.calculate_recursive_pivot_points(ohlcv_array)

            except ImportError:
                logger.warning("Williams Market Structure not available, using simplified pivot detection")
                pivot_levels = self._simple_pivot_detection(monthly_data)

            # Extract bounds from pivot analysis
            bounds = self._extract_bounds_from_pivot_levels(symbol, monthly_data, pivot_levels)

            return bounds

        except Exception as e:
            logger.error(f"Error extracting pivot bounds for {symbol}: {e}")
            return None

    def _extract_bounds_from_pivot_levels(self, symbol: str, monthly_data: pd.DataFrame,
                                          pivot_levels: Dict[str, Any]) -> PivotBounds:
        """Extract normalization bounds from Williams pivot levels"""
        try:
            # Initialize bounds
            price_max = monthly_data['high'].max()
            price_min = monthly_data['low'].min()
            volume_max = monthly_data['volume'].max()
            volume_min = monthly_data['volume'].min()

            support_levels = []
            resistance_levels = []

            # Extract pivot points from all Williams levels
            for level_key, level_data in pivot_levels.items():
                if level_data and hasattr(level_data, 'swing_points') and level_data.swing_points:
                    # Get prices from swing points
                    level_prices = [sp.price for sp in level_data.swing_points]

                    # Update overall price bounds
                    price_max = max(price_max, max(level_prices))
                    price_min = min(price_min, min(level_prices))

                    # Extract support and resistance levels
                    if hasattr(level_data, 'support_levels') and level_data.support_levels:
                        support_levels.extend(level_data.support_levels)

                    if hasattr(level_data, 'resistance_levels') and level_data.resistance_levels:
                        resistance_levels.extend(level_data.resistance_levels)

            # Remove duplicates and sort
            support_levels = sorted(set(support_levels))
            resistance_levels = sorted(set(resistance_levels))

            # Create PivotBounds object
            bounds = PivotBounds(
                symbol=symbol,
                price_max=float(price_max),
                price_min=float(price_min),
                volume_max=float(volume_max),
                volume_min=float(volume_min),
                pivot_support_levels=support_levels,
                pivot_resistance_levels=resistance_levels,
                pivot_context=pivot_levels,
                created_timestamp=datetime.now(),  # local time, consistent with the refresh-age checks
                data_period_start=monthly_data['timestamp'].min(),
                data_period_end=monthly_data['timestamp'].max(),
                total_candles_analyzed=len(monthly_data)
            )

            logger.info(f"Extracted pivot bounds for {symbol}:")
            logger.info(f"  Price range: ${bounds.price_min:.2f} - ${bounds.price_max:.2f}")
            logger.info(f"  Volume range: {bounds.volume_min:.2f} - {bounds.volume_max:.2f}")
            logger.info(f"  Support levels: {len(bounds.pivot_support_levels)}")
            logger.info(f"  Resistance levels: {len(bounds.pivot_resistance_levels)}")

            return bounds

        except Exception as e:
            logger.error(f"Error extracting bounds from pivot levels: {e}")
            # Fallback to simple min/max bounds
            return PivotBounds(
                symbol=symbol,
                price_max=float(monthly_data['high'].max()),
                price_min=float(monthly_data['low'].min()),
                volume_max=float(monthly_data['volume'].max()),
                volume_min=float(monthly_data['volume'].min()),
                pivot_support_levels=[],
                pivot_resistance_levels=[],
                pivot_context={},
                created_timestamp=datetime.now(),
                data_period_start=monthly_data['timestamp'].min(),
                data_period_end=monthly_data['timestamp'].max(),
                total_candles_analyzed=len(monthly_data)
            )

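    # Sketch of how these bounds feed the pivot-based normalization further down
    # in this class (numbers are made up):
    #
    #     bounds = self._extract_bounds_from_pivot_levels('ETH/USDT', monthly_df, pivot_levels)
    #     # get_pivot_normalized_features() then maps prices into [0, 1]:
    #     #   normalized_close = (close - bounds.price_min) / bounds.get_price_range()
    #     # and _add_pivot_context_features() flags closes within 2% of the
    #     # support/resistance lists collected here.
    #
    # `monthly_df` and `pivot_levels` stand for the inputs prepared by the caller
    # (_extract_pivot_bounds_from_monthly_data).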
    def _simple_pivot_detection(self, monthly_data: pd.DataFrame) -> Dict[str, Any]:
        """Simple pivot detection fallback when Williams Market Structure is not available"""
        try:
            # Simple high/low pivot detection using rolling windows
            highs = monthly_data['high']
            lows = monthly_data['low']

            # Find local maxima and minima using different windows
            pivot_highs = []
            pivot_lows = []

            for window in [5, 10, 20, 50]:
                if len(monthly_data) > window * 2:
                    # Rolling max/min detection
                    rolling_max = highs.rolling(window=window, center=True).max()
                    rolling_min = lows.rolling(window=window, center=True).min()

                    # Find pivot highs (local maxima)
                    high_pivots = monthly_data[highs == rolling_max]['high'].tolist()
                    pivot_highs.extend(high_pivots)

                    # Find pivot lows (local minima)
                    low_pivots = monthly_data[lows == rolling_min]['low'].tolist()
                    pivot_lows.extend(low_pivots)

            # Create mock level structure
            mock_level = type('MockLevel', (), {
                'swing_points': [],
                'support_levels': list(set(pivot_lows)),
                'resistance_levels': list(set(pivot_highs))
            })()

            return {'level_0': mock_level}

        except Exception as e:
            logger.error(f"Error in simple pivot detection: {e}")
            return {}

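    # Toy illustration of the rolling-extrema rule used above (assumes pandas is
    # imported as pd, as at the top of this module):
    #
    #     highs = pd.Series([1, 3, 2, 5, 4, 6, 3])
    #     rolling_max = highs.rolling(window=3, center=True).max()
    #     is_pivot_high = highs == rolling_max
    #     # A bar is flagged as a pivot high when it equals the centered rolling
    #     # maximum of its window: the 3, 5 and 6 above. The edge bars produce
    #     # NaN windows and therefore never match.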
    def _should_refresh_pivot_bounds(self, symbol: str) -> bool:
        """Check if pivot bounds need refreshing"""
        try:
            if symbol not in self.pivot_bounds:
                return True

            bounds = self.pivot_bounds[symbol]
            age = datetime.now() - bounds.created_timestamp

            return age > self.pivot_refresh_interval

        except Exception as e:
            logger.error(f"Error checking pivot bounds refresh: {e}")
            return True

    def _refresh_pivot_bounds_for_symbol(self, symbol: str):
        """Refresh pivot bounds for a specific symbol"""
        try:
            # Collect monthly 1m data
            monthly_data = self._collect_monthly_1m_data(symbol)
            if monthly_data is None or monthly_data.empty:
                logger.warning(f"Could not collect monthly data for {symbol}")
                return

            # Extract pivot bounds
            bounds = self._extract_pivot_bounds_from_monthly_data(symbol, monthly_data)
            if bounds is None:
                logger.warning(f"Could not extract pivot bounds for {symbol}")
                return

            # Store bounds
            self.pivot_bounds[symbol] = bounds

            # Save to cache
            self._save_pivot_bounds_to_cache(symbol, bounds)

            logger.info(f"Successfully refreshed pivot bounds for {symbol}")

        except Exception as e:
            logger.error(f"Error refreshing pivot bounds for {symbol}: {e}")

    def _add_pivot_context_features(self, df: pd.DataFrame, symbol: str) -> pd.DataFrame:
        """Add pivot-derived context features for normalization"""
        try:
            if symbol not in self.pivot_bounds:
                return df

            bounds = self.pivot_bounds[symbol]
            current_prices = df['close']

            # Distance to nearest support/resistance levels (normalized)
            df['pivot_support_distance'] = current_prices.apply(bounds.get_nearest_support_distance)
            df['pivot_resistance_distance'] = current_prices.apply(bounds.get_nearest_resistance_distance)

            # Price position within pivot range (0 = price_min, 1 = price_max)
            df['pivot_price_position'] = current_prices.apply(bounds.normalize_price).clip(0, 1)

            # Add binary features for proximity to key levels
            price_range = bounds.get_price_range()
            proximity_threshold = price_range * 0.02  # 2% of price range

            df['near_pivot_support'] = 0
            df['near_pivot_resistance'] = 0

            for price in current_prices:
                # Check if near any support level
                if any(abs(price - s) <= proximity_threshold for s in bounds.pivot_support_levels):
                    df.loc[df['close'] == price, 'near_pivot_support'] = 1

                # Check if near any resistance level
                if any(abs(price - r) <= proximity_threshold for r in bounds.pivot_resistance_levels):
                    df.loc[df['close'] == price, 'near_pivot_resistance'] = 1

            logger.debug(f"Added pivot context features for {symbol}")
            return df

        except Exception as e:
            logger.warning(f"Error adding pivot context features for {symbol}: {e}")
            return df

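    # Worked example of the columns added above, for a hypothetical bounds object
    # with price range 1000-2000 and a single support level at 1500:
    #
    #     close = 1510
    #     pivot_support_distance = abs(1510 - 1500) / (2000 - 1000)   # 0.01
    #     pivot_price_position   = (1510 - 1000) / (2000 - 1000)      # 0.51
    #     near_pivot_support     = 1   # because 10 <= 0.02 * 1000 = 20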
    def _extract_symbol_from_dataframe(self, df: pd.DataFrame) -> Optional[str]:
        """Extract symbol from dataframe context (basic implementation)"""
        # This is a simple placeholder - in a real system the symbol would be passed
        # explicitly or stored as metadata on the dataframe. For now it simply
        # returns the first configured symbol, if any.
        for symbol in self.symbols:
            return symbol
        return None

    # === CACHE MANAGEMENT ===

    def _auto_fix_corrupted_cache(self):
        """Automatically fix corrupted cache files on startup"""
        try:
            from utils.cache_manager import get_cache_manager
            cache_manager = get_cache_manager()

            # Quick health check
            health_summary = cache_manager.get_cache_summary()

            if health_summary['corrupted_files'] > 0:
                logger.warning(f"Found {health_summary['corrupted_files']} corrupted cache files, cleaning up...")

                # Auto-cleanup corrupted files (no confirmation needed)
                deleted_files = cache_manager.cleanup_corrupted_files(dry_run=False)

                deleted_count = 0
                for cache_dir, files in deleted_files.items():
                    for file_info in files:
                        if "DELETED:" in file_info:
                            deleted_count += 1

                logger.info(f"Auto-cleaned {deleted_count} corrupted cache files")
            else:
                logger.debug("Cache health check passed - no corrupted files found")

        except Exception as e:
            logger.warning(f"Cache auto-fix failed: {e}")

    # === PIVOT BOUNDS CACHING ===

    def _load_all_pivot_bounds(self):
        """Load all cached pivot bounds on startup"""
        try:
            for symbol in self.symbols:
                bounds = self._load_pivot_bounds_from_cache(symbol)
                if bounds:
                    self.pivot_bounds[symbol] = bounds
                    logger.info(f"Loaded cached pivot bounds for {symbol}")
        except Exception as e:
            logger.error(f"Error loading pivot bounds from cache: {e}")

    def _load_pivot_bounds_from_cache(self, symbol: str) -> Optional[PivotBounds]:
        """Load pivot bounds from cache"""
        try:
            cache_file = self.pivot_cache_dir / f"{symbol.replace('/', '')}_pivot_bounds.pkl"
            if cache_file.exists():
                with open(cache_file, 'rb') as f:
                    bounds = pickle.load(f)

                # Check if bounds are still valid (not too old)
                age = datetime.now() - bounds.created_timestamp
                if age <= self.pivot_refresh_interval:
                    return bounds
                else:
                    logger.info(f"Cached pivot bounds for {symbol} are too old ({age.days} days)")

            return None

        except Exception as e:
            logger.warning(f"Error loading pivot bounds from cache for {symbol}: {e}")
            return None

    def _save_pivot_bounds_to_cache(self, symbol: str, bounds: PivotBounds):
        """Save pivot bounds to cache"""
        try:
            cache_file = self.pivot_cache_dir / f"{symbol.replace('/', '')}_pivot_bounds.pkl"
            with open(cache_file, 'wb') as f:
                pickle.dump(bounds, f)
            logger.debug(f"Saved pivot bounds to cache for {symbol}")
        except Exception as e:
            logger.warning(f"Error saving pivot bounds to cache for {symbol}: {e}")

    def _load_monthly_data_from_cache(self, symbol: str) -> Optional[pd.DataFrame]:
        """Load monthly 1m data from cache"""
        try:
            cache_file = self.monthly_data_cache_dir / f"{symbol.replace('/', '')}_monthly_1m.parquet"
            if cache_file.exists():
                try:
                    df = pd.read_parquet(cache_file)
                    logger.info(f"Loaded {len(df)} 1m candles from cache for {symbol}")
                    return df
                except Exception as parquet_e:
                    # Handle corrupted Parquet file - expanded error detection
                    error_str = str(parquet_e).lower()
                    corrupted_indicators = [
                        "parquet magic bytes not found",
                        "corrupted",
                        "couldn't deserialize thrift",
                        "don't know what type",
                        "invalid parquet file",
                        "unexpected end of file",
                        "invalid metadata"
                    ]

                    if any(indicator in error_str for indicator in corrupted_indicators):
                        logger.warning(f"Corrupted Parquet cache file for {symbol}, removing and returning None: {parquet_e}")
                        try:
                            cache_file.unlink()  # Delete corrupted file
                            logger.info(f"Deleted corrupted monthly cache file: {cache_file}")
                        except Exception as delete_e:
                            logger.error(f"Failed to delete corrupted monthly cache file: {delete_e}")
                        return None
                    else:
                        raise parquet_e

            return None

        except Exception as e:
            logger.warning(f"Error loading monthly data from cache for {symbol}: {e}")
            return None

    def _save_monthly_data_to_cache(self, symbol: str, df: pd.DataFrame):
        """Save monthly 1m data to cache"""
        try:
            cache_file = self.monthly_data_cache_dir / f"{symbol.replace('/', '')}_monthly_1m.parquet"
            df.to_parquet(cache_file, index=False)
            logger.info(f"Saved {len(df)} monthly 1m candles to cache for {symbol}")
        except Exception as e:
            logger.warning(f"Error saving monthly data to cache for {symbol}: {e}")

    def get_pivot_bounds(self, symbol: str) -> Optional[PivotBounds]:
        """Get pivot bounds for a symbol"""
        return self.pivot_bounds.get(symbol)

    def get_pivot_normalized_features(self, symbol: str, df: pd.DataFrame) -> Optional[pd.DataFrame]:
        """Get dataframe with pivot-normalized features"""
        try:
            if symbol not in self.pivot_bounds:
                logger.warning(f"No pivot bounds available for {symbol}")
                return df

            bounds = self.pivot_bounds[symbol]
            normalized_df = df.copy()

            # Normalize price columns using pivot bounds
            price_range = bounds.get_price_range()
            for col in ['open', 'high', 'low', 'close']:
                if col in normalized_df.columns:
                    normalized_df[col] = (normalized_df[col] - bounds.price_min) / price_range

            # Normalize volume using pivot bounds
            volume_range = bounds.volume_max - bounds.volume_min
            if volume_range > 0 and 'volume' in normalized_df.columns:
                normalized_df['volume'] = (normalized_df['volume'] - bounds.volume_min) / volume_range

            return normalized_df

        except Exception as e:
            logger.error(f"Error applying pivot normalization for {symbol}: {e}")
            return df

    def build_base_data_input(self, symbol: str) -> Optional['BaseDataInput']:
        """
        Build BaseDataInput from cached data (optimized for speed)

        Args:
            symbol: Trading symbol

        Returns:
            BaseDataInput with consistent data structure
        """
        try:
            from .data_models import BaseDataInput

            # Get OHLCV data directly from the optimized cache (no validation checks, for speed)
            ohlcv_1s_list = self._get_cached_ohlcv_bars(symbol, '1s', 300)
            ohlcv_1m_list = self._get_cached_ohlcv_bars(symbol, '1m', 300)
            ohlcv_1h_list = self._get_cached_ohlcv_bars(symbol, '1h', 300)
            ohlcv_1d_list = self._get_cached_ohlcv_bars(symbol, '1d', 300)

            # Get BTC reference data
            btc_symbol = 'BTC/USDT'
            btc_ohlcv_1s_list = self._get_cached_ohlcv_bars(btc_symbol, '1s', 300)
            if not btc_ohlcv_1s_list:
                # Fall back to the requested symbol's own 1s data when no BTC data is cached
                btc_ohlcv_1s_list = ohlcv_1s_list

            # Get cached data (fast lookups)
            technical_indicators = self._get_latest_technical_indicators(symbol)
            cob_data = self._get_latest_cob_data_object(symbol)
            last_predictions = {}  # TODO: Implement model prediction caching

            # Build BaseDataInput (no validation, for speed - assume data is good)
            base_data = BaseDataInput(
                symbol=symbol,
                timestamp=datetime.now(),
                ohlcv_1s=ohlcv_1s_list,
                ohlcv_1m=ohlcv_1m_list,
                ohlcv_1h=ohlcv_1h_list,
                ohlcv_1d=ohlcv_1d_list,
                btc_ohlcv_1s=btc_ohlcv_1s_list,
                technical_indicators=technical_indicators,
                cob_data=cob_data,
                last_predictions=last_predictions
            )

            return base_data

        except Exception as e:
            logger.error(f"Error building BaseDataInput for {symbol}: {e}")
            return None

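    # Usage sketch (assumes a DataProvider instance named `provider` and the
    # 'ETH/USDT' symbol convention used throughout this module):
    #
    #     base_input = provider.build_base_data_input('ETH/USDT')
    #     if base_input is not None:
    #         print(len(base_input.ohlcv_1m), "cached 1m bars")
    #
    # The helper is built for speed: it assembles already-cached OHLCV bars,
    # indicators and COB data rather than validating or re-requesting anything.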
def _get_cached_ohlcv_bars(self, symbol: str, timeframe: str, max_count: int) -> List['OHLCVBar']:
|
|
"""Get OHLCV data list from cached data"""
|
|
try:
|
|
from .data_models import OHLCVBar
|
|
data_list = []
|
|
|
|
# Get cached data
|
|
if symbol in self.cached_data and timeframe in self.cached_data[symbol]:
|
|
cached_df = self.cached_data[symbol][timeframe]
|
|
if not cached_df.empty:
|
|
# Convert cached data to OHLCVBar objects
|
|
for idx, row in cached_df.tail(max_count).iterrows():
|
|
bar = OHLCVBar(
|
|
symbol=symbol,
|
|
timestamp=idx if hasattr(idx, 'to_pydatetime') else datetime.now(),
|
|
open=float(row['open']),
|
|
high=float(row['high']),
|
|
low=float(row['low']),
|
|
close=float(row['close']),
|
|
volume=float(row['volume']),
|
|
timeframe=timeframe
|
|
)
|
|
data_list.append(bar)
|
|
|
|
return data_list
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting cached OHLCV bars for {symbol}/{timeframe}: {e}")
|
|
return []
|
|
|
|
def _get_latest_technical_indicators(self, symbol: str) -> Dict[str, float]:
|
|
"""Get latest technical indicators for a symbol"""
|
|
try:
|
|
# Get latest data and calculate indicators
|
|
df = self.get_historical_data(symbol, '1h', limit=50)
|
|
if df is not None and not df.empty:
|
|
df_with_indicators = self._add_technical_indicators(df)
|
|
if not df_with_indicators.empty:
|
|
# Return the latest indicators as a dict
|
|
latest_row = df_with_indicators.iloc[-1]
|
|
indicators = {}
|
|
for col in df_with_indicators.columns:
|
|
if col not in ['open', 'high', 'low', 'close', 'volume', 'timestamp']:
|
|
indicators[col] = float(latest_row[col]) if pd.notna(latest_row[col]) else 0.0
|
|
return indicators
|
|
return {}
|
|
except Exception as e:
|
|
logger.error(f"Error getting technical indicators for {symbol}: {e}")
|
|
return {}
|
|
|
|
def _get_latest_cob_data_object(self, symbol: str) -> Optional['COBData']:
|
|
"""Get latest COB data as COBData object"""
|
|
try:
|
|
from .data_models import COBData
|
|
|
|
# Get latest COB data from cache
|
|
cob_data = self.get_latest_cob_data(symbol)
|
|
if cob_data and 'current_price' in cob_data:
|
|
return COBData(
|
|
symbol=symbol,
|
|
timestamp=datetime.now(),
|
|
current_price=cob_data['current_price'],
|
|
bucket_size=1.0 if 'ETH' in symbol else 10.0,
|
|
price_buckets=cob_data.get('price_buckets', {}),
|
|
bid_ask_imbalance=cob_data.get('bid_ask_imbalance', {}),
|
|
volume_weighted_prices=cob_data.get('volume_weighted_prices', {}),
|
|
order_flow_metrics=cob_data.get('order_flow_metrics', {}),
|
|
ma_1s_imbalance=cob_data.get('ma_1s_imbalance', {}),
|
|
ma_5s_imbalance=cob_data.get('ma_5s_imbalance', {}),
|
|
ma_15s_imbalance=cob_data.get('ma_15s_imbalance', {}),
|
|
ma_60s_imbalance=cob_data.get('ma_60s_imbalance', {})
|
|
)
|
|
return None
|
|
except Exception as e:
|
|
logger.error(f"Error getting COB data object for {symbol}: {e}")
|
|
return None
|
|
|
|
|
|
|
|
def _add_basic_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
"""Add basic indicators for small datasets"""
|
|
try:
|
|
df = df.copy()
|
|
|
|
# Basic moving averages
|
|
if len(df) >= 20:
|
|
df['sma_20'] = ta.trend.sma_indicator(df['close'], window=20)
|
|
df['ema_12'] = ta.trend.ema_indicator(df['close'], window=12)
|
|
|
|
# Basic RSI
|
|
if len(df) >= 14:
|
|
df['rsi_14'] = ta.momentum.rsi(df['close'], window=14)
|
|
|
|
# Basic volume indicators
|
|
if len(df) >= 10:
|
|
df['volume_sma_10'] = df['volume'].rolling(window=10).mean()
|
|
|
|
# Basic price action
|
|
df['price_position'] = (df['close'] - df['low']) / (df['high'] - df['low'])
|
|
df['price_position'] = df['price_position'].fillna(0.5) # Default to middle
|
|
|
|
# Fill NaN values
|
|
df = df.ffill().bfill().fillna(0)
|
|
|
|
return df
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error adding basic indicators: {e}")
|
|
return df
|
|
|
|
def _load_from_cache(self, symbol: str, timeframe: str) -> Optional[pd.DataFrame]:
|
|
"""Load data from cache"""
|
|
try:
|
|
cache_file = self.cache_dir / f"{symbol.replace('/', '')}_{timeframe}.parquet"
|
|
if cache_file.exists():
|
|
# Check if cache is recent - stricter rules for startup
|
|
cache_age = time.time() - cache_file.stat().st_mtime
|
|
|
|
# For 1m data, use cache only if less than 5 minutes old to avoid gaps
|
|
if timeframe == '1m':
|
|
max_age = 300 # 5 minutes
|
|
else:
|
|
max_age = 3600 # 1 hour for other timeframes
|
|
|
|
if cache_age < max_age:
|
|
try:
|
|
df = pd.read_parquet(cache_file)
|
|
logger.debug(f"Loaded {len(df)} rows from cache for {symbol} {timeframe} (age: {cache_age/60:.1f}min)")
|
|
return df
|
|
except Exception as parquet_e:
|
|
# Handle corrupted Parquet file - expanded error detection
|
|
error_str = str(parquet_e).lower()
|
|
corrupted_indicators = [
|
|
"parquet magic bytes not found",
|
|
"corrupted",
|
|
"couldn't deserialize thrift",
|
|
"don't know what type",
|
|
"invalid parquet file",
|
|
"unexpected end of file",
|
|
"invalid metadata"
|
|
]
|
|
|
|
if any(indicator in error_str for indicator in corrupted_indicators):
|
|
logger.warning(f"Corrupted Parquet cache file for {symbol} {timeframe}, removing and returning None: {parquet_e}")
|
|
try:
|
|
cache_file.unlink() # Delete corrupted file
|
|
logger.info(f"Deleted corrupted cache file: {cache_file}")
|
|
except Exception as delete_e:
|
|
logger.error(f"Failed to delete corrupted cache file: {delete_e}")
|
|
return None
|
|
else:
|
|
raise parquet_e
|
|
else:
|
|
logger.debug(f"Cache for {symbol} {timeframe} is too old ({cache_age/60:.1f}min > {max_age/60:.1f}min)")
|
|
return None
|
|
except Exception as e:
|
|
logger.warning(f"Error loading cache for {symbol} {timeframe}: {e}")
|
|
return None
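    # Cache-file naming and freshness rules used above, illustrated (paths are
    # relative to self.cache_dir and purely illustrative):
    #
    #     'ETH/USDT', '1m'  -> ETHUSDT_1m.parquet   (reused only if < 5 minutes old)
    #     'ETH/USDT', '1h'  -> ETHUSDT_1h.parquet   (reused only if < 1 hour old)
    #
    # Anything older is ignored, so callers fall back to refetching fresh candles.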
|
|
|
|
def _save_to_cache(self, df: pd.DataFrame, symbol: str, timeframe: str):
|
|
"""Save data to cache"""
|
|
try:
|
|
cache_file = self.cache_dir / f"{symbol.replace('/', '')}_{timeframe}.parquet"
|
|
df.to_parquet(cache_file, index=False)
|
|
logger.debug(f"Saved {len(df)} rows to cache for {symbol} {timeframe}")
|
|
except Exception as e:
|
|
logger.warning(f"Error saving cache for {symbol} {timeframe}: {e}")
|
|
|
|
async def start_real_time_streaming(self):
|
|
"""Start real-time data streaming using COBIntegration"""
|
|
if self.is_streaming:
|
|
logger.warning("Real-time streaming already active")
|
|
return
|
|
|
|
self.is_streaming = True
|
|
logger.info("Starting real-time streaming via COBIntegration")
|
|
# COBIntegration is started in the constructor
|
|
|
|
async def stop_real_time_streaming(self):
|
|
"""Stop real-time data streaming"""
|
|
if not self.is_streaming:
|
|
return
|
|
|
|
logger.info("Stopping Enhanced COB WebSocket streaming")
|
|
self.is_streaming = False
|
|
|
|
# Stop COB Integration
|
|
if self.cob_integration:
|
|
try:
|
|
await self.cob_integration.stop()
|
|
logger.info("COB Integration stopped")
|
|
except Exception as e:
|
|
logger.error(f"Error stopping COB Integration: {e}")
|
|
|
|
# Stop Enhanced COB WebSocket
|
|
if self.enhanced_cob_websocket:
|
|
try:
|
|
await self.enhanced_cob_websocket.stop()
|
|
self.enhanced_cob_websocket = None
|
|
logger.info("Enhanced COB WebSocket stopped")
|
|
except Exception as e:
|
|
logger.error(f"Error stopping Enhanced COB WebSocket: {e}")
|
|
|
|
# Cancel any remaining WebSocket tasks
|
|
for symbol, task in self.websocket_tasks.items():
|
|
if not task.done():
|
|
task.cancel()
|
|
try:
|
|
await task
|
|
except asyncio.CancelledError:
|
|
pass
|
|
|
|
self.websocket_tasks.clear()
|
|
|
|
# === COB DATA ACCESS METHODS ===
|
|
|
|
def get_cob_raw_ticks(self, symbol: str, count: int = 1000) -> List[Dict]:
|
|
"""Get raw COB ticks for a symbol (up to 15 minutes of data)"""
|
|
try:
|
|
if symbol in self.cob_raw_ticks:
|
|
return list(self.cob_raw_ticks[symbol])[-count:]
|
|
return []
|
|
except Exception as e:
|
|
logger.error(f"Error getting COB raw ticks for {symbol}: {e}")
|
|
return []
|
|
|
|
def get_cob_1s_aggregated(self, symbol: str, count: int = 300) -> List[Dict]:
|
|
"""Get 1s aggregated COB data with $1 price buckets"""
|
|
try:
|
|
if symbol in self.cob_1s_aggregated:
|
|
return list(self.cob_1s_aggregated[symbol])[-count:]
|
|
return []
|
|
except Exception as e:
|
|
logger.error(f"Error getting COB 1s aggregated for {symbol}: {e}")
|
|
return []
|
|
|
|
def get_latest_cob_data(self, symbol: str) -> Optional[Dict]:
|
|
"""Get latest COB raw tick for a symbol"""
|
|
try:
|
|
if symbol in self.cob_raw_ticks and self.cob_raw_ticks[symbol]:
|
|
return self.cob_raw_ticks[symbol][-1]
|
|
return None
|
|
except Exception as e:
|
|
logger.error(f"Error getting latest COB data for {symbol}: {e}")
|
|
return None
|
|
|
|
def get_latest_cob_aggregated(self, symbol: str) -> Optional[Dict]:
|
|
"""Get latest 1s aggregated COB data for a symbol"""
|
|
try:
|
|
if symbol in self.cob_1s_aggregated and self.cob_1s_aggregated[symbol]:
|
|
return self.cob_1s_aggregated[symbol][-1]
|
|
return None
|
|
except Exception as e:
|
|
logger.error(f"Error getting latest COB aggregated for {symbol}: {e}")
|
|
return None
|
|
|
|
def subscribe_to_cob_raw_ticks(self, callback: Callable[[str, Dict], None]) -> str:
|
|
"""Subscribe to raw COB tick updates"""
|
|
subscriber_id = str(uuid.uuid4())
|
|
self.cob_data_callbacks.append(callback)
|
|
logger.info(f"COB raw tick subscriber added: {subscriber_id}")
|
|
return subscriber_id
|
|
|
|
def subscribe_to_cob_aggregated(self, callback: Callable[[str, Dict], None]) -> str:
|
|
"""Subscribe to 1s aggregated COB updates"""
|
|
subscriber_id = str(uuid.uuid4())
|
|
self.cob_aggregated_callbacks.append(callback)
|
|
logger.info(f"COB aggregated subscriber added: {subscriber_id}")
|
|
return subscriber_id
|
|
|
|
def get_cob_price_buckets(self, symbol: str, timeframe_seconds: int = 60) -> Dict:
|
|
"""Get price bucket analysis for a timeframe"""
|
|
try:
|
|
# Get aggregated data for the timeframe
|
|
count = min(timeframe_seconds, 900) # Max 15 minutes
|
|
aggregated_data = self.get_cob_1s_aggregated(symbol, count)
|
|
|
|
if not aggregated_data:
|
|
return {}
|
|
|
|
# Combine buckets across the timeframe
|
|
combined_bid_buckets = {}
|
|
combined_ask_buckets = {}
|
|
|
|
for data in aggregated_data:
|
|
bid_buckets = data.get('bid_buckets', {})
|
|
ask_buckets = data.get('ask_buckets', {})
|
|
|
|
for bucket, volume in bid_buckets.items():
|
|
if bucket not in combined_bid_buckets:
|
|
combined_bid_buckets[bucket] = 0
|
|
combined_bid_buckets[bucket] += volume
|
|
|
|
for bucket, volume in ask_buckets.items():
|
|
if bucket not in combined_ask_buckets:
|
|
combined_ask_buckets[bucket] = 0
|
|
combined_ask_buckets[bucket] += volume
|
|
|
|
return {
|
|
'symbol': symbol,
|
|
'timeframe_seconds': timeframe_seconds,
|
|
'bucket_size_usd': 1.0,
|
|
'bid_buckets': dict(sorted(combined_bid_buckets.items(), reverse=True)),
|
|
'ask_buckets': dict(sorted(combined_ask_buckets.items())),
|
|
'total_bid_volume': sum(combined_bid_buckets.values()),
|
|
'total_ask_volume': sum(combined_ask_buckets.values()),
|
|
'bucket_count': len(combined_bid_buckets) + len(combined_ask_buckets)
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting COB price buckets for {symbol}: {e}")
|
|
return {}
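    # Shape of the dict returned above, sketched with made-up numbers:
    #
    #     {
    #         'symbol': 'ETH/USDT', 'timeframe_seconds': 60, 'bucket_size_usd': 1.0,
    #         'bid_buckets': {2001.0: 12.5, 2000.0: 9.1},   # highest bid bucket first
    #         'ask_buckets': {2002.0: 7.3, 2003.0: 4.0},    # lowest ask bucket first
    #         'total_bid_volume': 21.6, 'total_ask_volume': 11.3, 'bucket_count': 4,
    #     }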
|
|
|
|
def get_cob_websocket_status(self) -> Dict[str, Any]:
|
|
"""Get COB WebSocket status"""
|
|
try:
|
|
if self.cob_integration:
|
|
return {
|
|
'status': 'active',
|
|
'symbols': self.symbols,
|
|
'websocket_status': self.cob_integration.get_websocket_status(),
|
|
'raw_tick_counts': {symbol: len(self.cob_raw_ticks[symbol]) for symbol in self.symbols},
|
|
'aggregated_counts': {symbol: len(self.cob_1s_aggregated[symbol]) for symbol in self.symbols}
|
|
}
|
|
else:
|
|
return {
|
|
'status': 'inactive',
|
|
'symbols': self.symbols,
|
|
'error': 'COB integration not initialized'
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error getting COB WebSocket status: {e}")
|
|
return {
|
|
'status': 'error',
|
|
'error': str(e)
|
|
}
|
|
|
|
    async def _start_fallback_websocket_streaming(self):
        """Fallback to the old per-symbol WebSocket method if Enhanced COB WebSocket fails"""
        try:
            logger.warning("⚠️ Starting fallback WebSocket streaming")

            # Start the old WebSocket stream for each symbol
            for symbol in self.symbols:
                task = asyncio.create_task(self._websocket_stream(symbol))
                self.websocket_tasks[symbol] = task

        except Exception as e:
            logger.error(f"❌ Error starting fallback WebSocket: {e}")

    # NOTE: the two methods below redefine get_cob_websocket_status() and
    # get_latest_cob_data(), which are already defined earlier in this class.
    # Python keeps only the last definition, so these Enhanced-COB-WebSocket
    # based versions are the ones in effect at runtime.
    def get_cob_websocket_status(self) -> Dict[str, Any]:
        """Get COB WebSocket status for dashboard"""
        try:
            if self.enhanced_cob_websocket:
                return self.enhanced_cob_websocket.get_status_summary()
            else:
                return {
                    'overall_status': 'not_initialized',
                    'symbols': {},
                    'websockets_available': False
                }
        except Exception as e:
            logger.error(f"Error getting COB WebSocket status: {e}")
            return {
                'overall_status': 'error',
                'symbols': {},
                'error': str(e)
            }

    def get_latest_cob_data(self, symbol: str) -> Optional[Dict]:
        """Get latest COB data from Enhanced WebSocket"""
        try:
            return self.cob_websocket_data.get(symbol)
        except Exception as e:
            logger.error(f"Error getting latest COB data for {symbol}: {e}")
            return None

async def _websocket_stream(self, symbol: str):
|
|
"""WebSocket stream for a single symbol using trade stream for better granularity"""
|
|
binance_symbol = symbol.replace('/', '').upper()
|
|
url = f"wss://stream.binance.com:9443/ws/{binance_symbol.lower()}@trade"
|
|
|
|
while self.is_streaming:
|
|
try:
|
|
logger.info(f"Connecting to WebSocket for {symbol}: {url}")
|
|
|
|
async with websockets.connect(url) as websocket:
|
|
logger.info(f"WebSocket connected for {symbol}")
|
|
|
|
async for message in websocket:
|
|
if not self.is_streaming:
|
|
break
|
|
|
|
try:
|
|
await self._process_trade_message(binance_symbol, message)
|
|
except Exception as e:
|
|
logger.warning(f"Error processing trade message for {symbol}: {e}")
|
|
|
|
except Exception as e:
|
|
logger.error(f"WebSocket error for {symbol}: {e}")
|
|
self.distribution_stats['distribution_errors'] += 1
|
|
|
|
if self.is_streaming:
|
|
logger.info(f"Reconnecting WebSocket for {symbol} in 5 seconds...")
|
|
await asyncio.sleep(5)
|
|
|
|
async def _process_trade_message(self, symbol: str, message: str):
|
|
"""Process incoming trade message and distribute to subscribers"""
|
|
try:
|
|
trade_data = json.loads(message)
|
|
|
|
# Extract trade information
|
|
price = float(trade_data.get('p', 0))
|
|
quantity = float(trade_data.get('q', 0))
|
|
timestamp = datetime.fromtimestamp(int(trade_data.get('T', 0)) / 1000)
|
|
is_buyer_maker = trade_data.get('m', False)
|
|
trade_id = trade_data.get('t', '')
|
|
|
|
# Calculate volume in USDT
|
|
volume_usdt = price * quantity
|
|
|
|
# Data validation
|
|
if not self._validate_tick_data(symbol, price, volume_usdt):
|
|
logger.warning(f"Invalid tick data for {symbol}: price={price}, volume={volume_usdt}")
|
|
return
|
|
|
|
# Process raw tick through aggregator
|
|
side = 'sell' if is_buyer_maker else 'buy'
|
|
raw_tick, completed_bar = self.tick_aggregator.process_tick(
|
|
symbol=symbol,
|
|
timestamp=timestamp,
|
|
price=price,
|
|
volume=volume_usdt,
|
|
quantity=quantity,
|
|
side=side,
|
|
trade_id=str(trade_id)
|
|
)
|
|
|
|
# Update statistics
|
|
self.distribution_stats['total_ticks_received'] += 1
|
|
self.distribution_stats['ticks_per_symbol'][symbol] += 1
|
|
self.distribution_stats['last_tick_time'][symbol] = timestamp
|
|
self.last_prices[symbol] = price
|
|
|
|
if raw_tick:
|
|
self.distribution_stats['raw_ticks_processed'] += 1
|
|
|
|
# Notify raw tick callbacks
|
|
for callback in self.raw_tick_callbacks:
|
|
try:
|
|
callback(raw_tick)
|
|
except Exception as e:
|
|
logger.error(f"Error in raw tick callback: {e}")
|
|
|
|
if completed_bar:
|
|
self.distribution_stats['ohlcv_bars_created'] += 1
|
|
|
|
# Notify OHLCV bar callbacks
|
|
for callback in self.ohlcv_bar_callbacks:
|
|
try:
|
|
callback(completed_bar)
|
|
except Exception as e:
|
|
logger.error(f"Error in OHLCV bar callback: {e}")
|
|
|
|
# Create standardized tick for legacy compatibility
|
|
tick = MarketTick(
|
|
symbol=symbol,
|
|
timestamp=timestamp,
|
|
price=price,
|
|
volume=volume_usdt,
|
|
quantity=quantity,
|
|
side=side,
|
|
trade_id=str(trade_id),
|
|
is_buyer_maker=is_buyer_maker,
|
|
raw_data=trade_data
|
|
)
|
|
|
|
# Add to buffer
|
|
self.tick_buffers[symbol].append(tick)
|
|
|
|
# Update current prices and candles
|
|
await self._process_tick(symbol, tick)
|
|
|
|
# Distribute to all subscribers
|
|
self._distribute_tick(tick)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing trade message for {symbol}: {e}")
|
|
|
|
async def _process_tick(self, symbol: str, tick: MarketTick):
|
|
"""Process a single tick and update candles"""
|
|
try:
|
|
# Update current price
|
|
with self.data_lock:
|
|
self.current_prices[symbol] = tick.price
|
|
|
|
# Initialize real-time data structure if needed
|
|
if symbol not in self.real_time_data:
|
|
self.real_time_data[symbol] = {}
|
|
for tf in self.timeframes:
|
|
self.real_time_data[symbol][tf] = deque(maxlen=1000)
|
|
|
|
# Create tick record for candle updates
|
|
tick_record = {
|
|
'timestamp': tick.timestamp,
|
|
'price': tick.price,
|
|
'volume': tick.volume
|
|
}
|
|
|
|
# Update all timeframes
|
|
for timeframe in self.timeframes:
|
|
self._update_candle(symbol, timeframe, tick_record)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing tick for {symbol}: {e}")
|
|
|
|
    def _update_candle(self, symbol: str, timeframe: str, tick: Dict):
        """Update candle for specific timeframe"""
        try:
            timeframe_secs = self.timeframe_seconds.get(timeframe, 3600)
            current_time = tick['timestamp']

            # Calculate candle start time using proper datetime truncation
            if isinstance(current_time, datetime):
                timestamp_seconds = current_time.timestamp()
            else:
                timestamp_seconds = current_time.timestamp() if hasattr(current_time, 'timestamp') else current_time

            # Truncate to timeframe boundary
            candle_start_seconds = int(timestamp_seconds // timeframe_secs) * timeframe_secs
            candle_start = datetime.fromtimestamp(candle_start_seconds)

            # Get current candle queue
            candle_queue = self.real_time_data[symbol][timeframe]

            # Check if we need a new candle
            if not candle_queue or candle_queue[-1]['timestamp'] != candle_start:
                # Create new candle
                new_candle = {
                    'timestamp': candle_start,
                    'open': tick['price'],
                    'high': tick['price'],
                    'low': tick['price'],
                    'close': tick['price'],
                    'volume': tick['volume']
                }
                candle_queue.append(new_candle)
            else:
                # Update existing candle
                current_candle = candle_queue[-1]
                current_candle['high'] = max(current_candle['high'], tick['price'])
                current_candle['low'] = min(current_candle['low'], tick['price'])
                current_candle['close'] = tick['price']
                current_candle['volume'] += tick['volume']

        except Exception as e:
            logger.error(f"Error updating candle for {symbol} {timeframe}: {e}")

def get_latest_candles(self, symbol: str, timeframe: str, limit: int = 100) -> pd.DataFrame:
|
|
"""Get the latest candles from cached data only"""
|
|
try:
|
|
# Get cached data
|
|
cached_df = self.get_historical_data(symbol, timeframe, limit=limit)
|
|
|
|
# Get real-time data if available
|
|
with self.data_lock:
|
|
if symbol in self.real_time_data and timeframe in self.real_time_data[symbol]:
|
|
real_time_candles = list(self.real_time_data[symbol][timeframe])
|
|
|
|
if real_time_candles:
|
|
# Convert to DataFrame
|
|
rt_df = pd.DataFrame(real_time_candles)
|
|
|
|
if cached_df is not None and not cached_df.empty:
|
|
# Combine cached and real-time
|
|
# Remove overlapping candles from cached data
|
|
if not rt_df.empty:
|
|
cutoff_time = rt_df['timestamp'].min()
|
|
cached_df = cached_df[cached_df.index < cutoff_time]
|
|
|
|
# Concatenate
|
|
combined_df = pd.concat([cached_df, rt_df], ignore_index=True)
|
|
else:
|
|
combined_df = rt_df
|
|
|
|
return combined_df.tail(limit)
|
|
|
|
# Return just cached data if no real-time data
|
|
return cached_df.tail(limit) if cached_df is not None else pd.DataFrame()
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting latest candles for {symbol} {timeframe}: {e}")
|
|
return pd.DataFrame()
|
|
|
|
    def get_current_price(self, symbol: str) -> Optional[float]:
        """Get current price for a symbol from cached data"""
        try:
            # Try to get from 1s candle first (most recent)
            for tf in ['1s', '1m', '1h', '1d']:
                if symbol in self.cached_data and tf in self.cached_data[symbol]:
                    df = self.cached_data[symbol][tf]
                    if not df.empty:
                        return float(df.iloc[-1]['close'])

            logger.warning(f"No cached price data available for {symbol}")
            return None

        except Exception as e:
            logger.error(f"Error getting current price for {symbol}: {e}")
            return None

def calculate_williams_pivot_points(self, symbol: str, force_recalculate: bool = False) -> Dict[int, TrendLevel]:
|
|
"""
|
|
Calculate Williams Market Structure pivot points for a symbol
|
|
|
|
Args:
|
|
symbol: Trading symbol (e.g., 'ETH/USDT')
|
|
force_recalculate: Force recalculation even if cache is fresh
|
|
|
|
Returns:
|
|
Dictionary of trend levels with pivot points
|
|
"""
|
|
try:
|
|
# Check if we need to recalculate
|
|
now = datetime.now()
|
|
if (not force_recalculate and
|
|
symbol in self.last_pivot_calculation and
|
|
now - self.last_pivot_calculation[symbol] < self.pivot_calculation_interval):
|
|
# Return cached results
|
|
return self.pivot_points_cache.get(symbol, {})
|
|
|
|
# Get 1s OHLCV data for Williams Market Structure calculation
|
|
df_1s = self.get_historical_data(symbol, '1s', limit=1000)
|
|
if df_1s is None or len(df_1s) < 50:
|
|
logger.warning(f"Insufficient 1s data for Williams pivot calculation: {symbol}")
|
|
return {}
|
|
|
|
# Convert DataFrame to numpy array for Williams calculation
|
|
# Format: [timestamp_ms, open, high, low, close, volume]
|
|
ohlcv_array = np.column_stack([
|
|
df_1s.index.astype(np.int64) // 10**6, # Convert to milliseconds
|
|
df_1s['open'].values,
|
|
df_1s['high'].values,
|
|
df_1s['low'].values,
|
|
df_1s['close'].values,
|
|
df_1s['volume'].values
|
|
])
|
|
|
|
# Calculate recursive pivot points using Williams Market Structure
|
|
williams = self.williams_structure[symbol]
|
|
pivot_levels = williams.calculate_recursive_pivot_points(ohlcv_array)
|
|
|
|
# Cache the results
|
|
self.pivot_points_cache[symbol] = pivot_levels
|
|
self.last_pivot_calculation[symbol] = now
|
|
|
|
logger.debug(f"Calculated Williams pivot points for {symbol}: {len(pivot_levels)} levels")
|
|
return pivot_levels
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error calculating Williams pivot points for {symbol}: {e}")
|
|
return {}
|
|
|
|
def get_pivot_features_for_ml(self, symbol: str) -> np.ndarray:
|
|
"""
|
|
Get pivot point features for machine learning models
|
|
|
|
Returns a 250-element feature vector containing:
|
|
- Recent pivot points (price, strength, type) for each level
|
|
- Trend direction and strength for each level
|
|
- Time since last pivot for each level
|
|
"""
|
|
try:
|
|
# Ensure we have fresh pivot points
|
|
pivot_levels = self.calculate_williams_pivot_points(symbol)
|
|
|
|
if not pivot_levels:
|
|
logger.warning(f"No pivot points available for {symbol}")
|
|
return np.zeros(250, dtype=np.float32)
|
|
|
|
# Use Williams Market Structure to extract ML features
|
|
williams = self.williams_structure[symbol]
|
|
features = williams.get_pivot_features_for_ml(symbol)
|
|
|
|
return features
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting pivot features for ML: {e}")
|
|
return np.zeros(250, dtype=np.float32)
|
|
|
|
def get_market_structure_summary(self, symbol: str) -> Dict[str, Any]:
|
|
"""
|
|
Get current market structure summary for dashboard display
|
|
|
|
Returns:
|
|
Dictionary containing market structure information
|
|
"""
|
|
try:
|
|
# Ensure we have fresh pivot points
|
|
pivot_levels = self.calculate_williams_pivot_points(symbol)
|
|
|
|
if not pivot_levels:
|
|
return {
|
|
'symbol': symbol,
|
|
'levels': {},
|
|
'overall_trend': 'sideways',
|
|
'overall_strength': 0.0,
|
|
'last_update': datetime.now().isoformat(),
|
|
'error': 'No pivot points available'
|
|
}
|
|
|
|
# Use Williams Market Structure to get summary
|
|
williams = self.williams_structure[symbol]
|
|
structure = williams.get_current_market_structure()
|
|
structure['symbol'] = symbol
|
|
|
|
return structure
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting market structure summary for {symbol}: {e}")
|
|
return {
|
|
'symbol': symbol,
|
|
'levels': {},
|
|
'overall_trend': 'sideways',
|
|
'overall_strength': 0.0,
|
|
'last_update': datetime.now().isoformat(),
|
|
'error': str(e)
|
|
}
|
|
|
|
def get_recent_pivot_points(self, symbol: str, level: int = 1, count: int = 10) -> List[PivotPoint]:
|
|
"""
|
|
Get recent pivot points for a specific level
|
|
|
|
Args:
|
|
symbol: Trading symbol
|
|
level: Pivot level (1-5)
|
|
count: Number of recent pivots to return
|
|
|
|
Returns:
|
|
List of recent pivot points
|
|
"""
|
|
try:
|
|
pivot_levels = self.calculate_williams_pivot_points(symbol)
|
|
|
|
if level not in pivot_levels:
|
|
return []
|
|
|
|
trend_level = pivot_levels[level]
|
|
recent_pivots = trend_level.pivot_points[-count:] if len(trend_level.pivot_points) >= count else trend_level.pivot_points
|
|
|
|
return recent_pivots
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting recent pivot points for {symbol} level {level}: {e}")
|
|
return []
|
|
|
|
    def get_price_at_index(self, symbol: str, index: int, timeframe: str = '1m') -> Optional[float]:
        """Get price at specific index for backtesting from cached data"""
        try:
            if symbol in self.cached_data and timeframe in self.cached_data[symbol]:
                df = self.cached_data[symbol][timeframe]
                if not df.empty and 0 <= index < len(df):
                    return float(df.iloc[index]['close'])
            return None
        except Exception as e:
            logger.error(f"Error getting price at index {index}: {e}")
            return None

def get_feature_matrix(self, symbol: str, timeframes: List[str] = None,
|
|
window_size: int = 20) -> Optional[np.ndarray]:
|
|
"""
|
|
Get comprehensive feature matrix for multiple timeframes with technical indicators
|
|
|
|
Returns:
|
|
np.ndarray: Shape (n_timeframes, window_size, n_features)
|
|
Each timeframe becomes a separate channel for CNN
|
|
"""
|
|
try:
|
|
if timeframes is None:
|
|
timeframes = self.timeframes
|
|
|
|
feature_channels = []
|
|
common_feature_names = None
|
|
|
|
# First pass: determine common features across all timeframes
|
|
timeframe_features = {}
|
|
for tf in timeframes:
|
|
logger.debug(f"Processing timeframe {tf} for {symbol}")
|
|
# Use cached data directly
|
|
if symbol in self.cached_data and tf in self.cached_data[symbol]:
|
|
df = self.cached_data[symbol][tf]
|
|
if not df.empty and len(df) >= window_size:
|
|
df = df.tail(window_size + 100) # Get enough data for indicators
|
|
else:
|
|
logger.warning(f"Insufficient cached data for {symbol} {tf}: {len(df) if not df.empty else 0} rows")
|
|
continue
|
|
else:
|
|
logger.warning(f"No cached data for {symbol} {tf}")
|
|
continue
|
|
|
|
# Get feature columns
|
|
basic_cols = ['open', 'high', 'low', 'close', 'volume']
|
|
indicator_cols = [col for col in df.columns
|
|
if col not in basic_cols + ['timestamp'] and not col.startswith('unnamed')]
|
|
|
|
selected_features = self._select_cnn_features(df, basic_cols, indicator_cols)
|
|
timeframe_features[tf] = (df, selected_features)
|
|
|
|
if common_feature_names is None:
|
|
common_feature_names = set(selected_features)
|
|
else:
|
|
common_feature_names = common_feature_names.intersection(set(selected_features))
|
|
|
|
if not common_feature_names:
|
|
logger.error(f"No common features found across timeframes for {symbol}")
|
|
return None
|
|
|
|
# Convert to sorted list for consistent ordering
|
|
common_feature_names = sorted(list(common_feature_names))
|
|
# logger.info(f"Using {len(common_feature_names)} common features: {common_feature_names}")
|
|
|
|
# Second pass: create feature channels with common features
|
|
for tf in timeframes:
|
|
if tf not in timeframe_features:
|
|
continue
|
|
|
|
df, _ = timeframe_features[tf]
|
|
|
|
# Use only common features
|
|
try:
|
|
tf_features = self._normalize_features(df[common_feature_names].tail(window_size), symbol=symbol)
|
|
|
|
if tf_features is not None and len(tf_features) == window_size:
|
|
feature_channels.append(tf_features.values)
|
|
logger.debug(f"Added {len(common_feature_names)} features for {tf}")
|
|
else:
|
|
logger.warning(f"Feature normalization failed for {tf}")
|
|
except Exception as e:
|
|
logger.error(f"Error processing features for {tf}: {e}")
|
|
continue
|
|
|
|
if not feature_channels:
|
|
logger.error(f"No valid feature channels created for {symbol}")
|
|
return None
|
|
|
|
# Verify all channels have the same shape
|
|
shapes = [channel.shape for channel in feature_channels]
|
|
if len(set(shapes)) > 1:
|
|
logger.error(f"Shape mismatch in feature channels: {shapes}")
|
|
return None
|
|
|
|
# Stack all timeframe channels
|
|
feature_matrix = np.stack(feature_channels, axis=0)
|
|
|
|
logger.debug(f"Created feature matrix for {symbol}: {feature_matrix.shape} "
|
|
f"({len(feature_channels)} timeframes, {window_size} steps, {len(common_feature_names)} features)")
|
|
|
|
return feature_matrix
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error creating feature matrix for {symbol}: {e}")
|
|
import traceback
|
|
logger.error(traceback.format_exc())
|
|
return None
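    # Usage sketch for the feature matrix above (`provider` is a placeholder for
    # a DataProvider instance; exact feature count depends on the indicators
    # shared across the requested timeframes):
    #
    #     matrix = provider.get_feature_matrix('ETH/USDT', timeframes=['1m', '1h'], window_size=20)
    #     if matrix is not None:
    #         assert matrix.shape[0] == 2 and matrix.shape[1] == 20
    #
    # Each timeframe is one channel, so a CNN can treat the result like a
    # multi-channel image of normalized OHLCV and indicator columns.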
|
|
|
|
def _select_cnn_features(self, df: pd.DataFrame, basic_cols: List[str], indicator_cols: List[str]) -> List[str]:
|
|
"""Select the most important features for CNN training"""
|
|
try:
|
|
selected = []
|
|
|
|
# Always include basic OHLCV (normalized)
|
|
selected.extend(basic_cols)
|
|
|
|
# Priority indicators (most informative for CNNs)
|
|
priority_indicators = [
|
|
# Trend indicators
|
|
'sma_10', 'sma_20', 'sma_50', 'ema_12', 'ema_26', 'ema_50',
|
|
'macd', 'macd_signal', 'macd_histogram',
|
|
'adx', 'adx_pos', 'adx_neg', 'psar',
|
|
|
|
# Momentum indicators
|
|
'rsi_14', 'rsi_7', 'rsi_21',
|
|
'stoch_k', 'stoch_d', 'williams_r', 'ultimate_osc',
|
|
|
|
# Volatility indicators
|
|
'bb_upper', 'bb_lower', 'bb_middle', 'bb_width', 'bb_percent',
|
|
'atr', 'keltner_upper', 'keltner_lower', 'keltner_middle',
|
|
|
|
# Volume indicators
|
|
'volume_sma_10', 'volume_sma_20', 'obv', 'vpt', 'mfi', 'ad_line', 'vwap',
|
|
|
|
# Price action
|
|
'price_position', 'true_range', 'roc',
|
|
|
|
# Custom composites
|
|
'trend_strength', 'momentum_composite', 'volatility_regime'
|
|
]
|
|
|
|
# Add available priority indicators
|
|
for indicator in priority_indicators:
|
|
if indicator in indicator_cols:
|
|
selected.append(indicator)
|
|
|
|
# Add any other technical indicators not in priority list (limit to avoid curse of dimensionality)
|
|
remaining_indicators = [col for col in indicator_cols if col not in selected]
|
|
if remaining_indicators:
|
|
# Limit to 10 additional indicators
|
|
selected.extend(remaining_indicators[:10])
|
|
|
|
# Verify all selected features exist in dataframe
|
|
final_selected = [col for col in selected if col in df.columns]
|
|
|
|
logger.debug(f"Selected {len(final_selected)} features from {len(df.columns)} available columns")
|
|
return final_selected
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error selecting CNN features: {e}")
|
|
return basic_cols # Fallback to basic OHLCV
|
|
|
|
def _normalize_features(self, df: pd.DataFrame, symbol: str = None) -> Optional[pd.DataFrame]:
|
|
"""Normalize features for CNN training using pivot-based bounds when available"""
|
|
try:
|
|
df_norm = df.copy()
|
|
|
|
# Try to use pivot-based normalization if available
|
|
if symbol and symbol in self.pivot_bounds:
|
|
bounds = self.pivot_bounds[symbol]
|
|
price_range = bounds.get_price_range()
|
|
|
|
# Normalize price-based features using pivot bounds
|
|
price_cols = ['open', 'high', 'low', 'close', 'sma_10', 'sma_20', 'sma_50',
|
|
'ema_12', 'ema_26', 'ema_50', 'bb_upper', 'bb_lower', 'bb_middle',
|
|
'keltner_upper', 'keltner_lower', 'keltner_middle', 'psar', 'vwap']
|
|
|
|
for col in price_cols:
|
|
if col in df_norm.columns:
|
|
# Use pivot bounds for normalization
|
|
df_norm[col] = (df_norm[col] - bounds.price_min) / price_range
|
|
|
|
# Normalize volume using pivot bounds
|
|
if 'volume' in df_norm.columns:
|
|
volume_range = bounds.volume_max - bounds.volume_min
|
|
if volume_range > 0:
|
|
df_norm['volume'] = (df_norm['volume'] - bounds.volume_min) / volume_range
|
|
else:
|
|
df_norm['volume'] = 0.5 # Default to middle if no volume range
|
|
|
|
logger.debug(f"Applied pivot-based normalization for {symbol}")
|
|
|
|
else:
|
|
# Fallback to traditional normalization when pivot bounds not available
|
|
logger.debug("Using traditional normalization (no pivot bounds available)")
|
|
|
|
for col in df_norm.columns:
|
|
if col in ['open', 'high', 'low', 'close', 'sma_10', 'sma_20', 'sma_50',
|
|
'ema_12', 'ema_26', 'ema_50', 'bb_upper', 'bb_lower', 'bb_middle',
|
|
'keltner_upper', 'keltner_lower', 'keltner_middle', 'psar', 'vwap']:
|
|
# Price-based indicators: normalize by close price
|
|
if 'close' in df_norm.columns:
|
|
base_price = df_norm['close'].iloc[-1] # Use latest close as reference
|
|
if base_price > 0:
|
|
df_norm[col] = df_norm[col] / base_price
|
|
|
|
elif col == 'volume':
|
|
# Volume: normalize by its own rolling mean
|
|
volume_mean = df_norm[col].rolling(window=min(20, len(df_norm))).mean().iloc[-1]
|
|
if volume_mean > 0:
|
|
df_norm[col] = df_norm[col] / volume_mean
|
|
|
|
# Normalize indicators that have standard ranges (regardless of pivot bounds)
|
|
for col in df_norm.columns:
|
|
if col in ['rsi_14', 'rsi_7', 'rsi_21']:
|
|
# RSI: already 0-100, normalize to 0-1
|
|
df_norm[col] = df_norm[col] / 100.0
|
|
|
|
elif col in ['stoch_k', 'stoch_d']:
|
|
# Stochastic: already 0-100, normalize to 0-1
|
|
df_norm[col] = df_norm[col] / 100.0
|
|
|
|
elif col == 'williams_r':
|
|
# Williams %R: -100 to 0, normalize to 0-1
|
|
df_norm[col] = (df_norm[col] + 100) / 100.0
|
|
|
|
elif col in ['macd', 'macd_signal', 'macd_histogram']:
|
|
# MACD: normalize by ATR or close price
|
|
if 'atr' in df_norm.columns and df_norm['atr'].iloc[-1] > 0:
|
|
df_norm[col] = df_norm[col] / df_norm['atr'].iloc[-1]
|
|
elif 'close' in df_norm.columns and df_norm['close'].iloc[-1] > 0:
|
|
df_norm[col] = df_norm[col] / df_norm['close'].iloc[-1]
|
|
|
|
elif col in ['bb_width', 'bb_percent', 'price_position', 'trend_strength',
|
|
'momentum_composite', 'volatility_regime', 'pivot_price_position',
|
|
'pivot_support_distance', 'pivot_resistance_distance']:
|
|
# Already normalized indicators: ensure 0-1 range
|
|
df_norm[col] = np.clip(df_norm[col], 0, 1)
|
|
|
|
elif col in ['atr', 'true_range']:
|
|
# Volatility indicators: normalize by close price or pivot range
|
|
if symbol and symbol in self.pivot_bounds:
|
|
bounds = self.pivot_bounds[symbol]
|
|
df_norm[col] = df_norm[col] / bounds.get_price_range()
|
|
elif 'close' in df_norm.columns and df_norm['close'].iloc[-1] > 0:
|
|
df_norm[col] = df_norm[col] / df_norm['close'].iloc[-1]
|
|
|
|
elif col not in ['timestamp', 'near_pivot_support', 'near_pivot_resistance']:
|
|
# Other indicators: z-score normalization
|
|
col_mean = df_norm[col].rolling(window=min(20, len(df_norm))).mean().iloc[-1]
|
|
col_std = df_norm[col].rolling(window=min(20, len(df_norm))).std().iloc[-1]
|
|
if col_std > 0:
|
|
df_norm[col] = (df_norm[col] - col_mean) / col_std
|
|
else:
|
|
df_norm[col] = 0
|
|
|
|
# Replace inf/-inf with 0
|
|
df_norm = df_norm.replace([np.inf, -np.inf], 0)
|
|
|
|
# Fill any remaining NaN values
|
|
df_norm = df_norm.fillna(0)
|
|
|
|
return df_norm
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error normalizing features: {e}")
|
|
return df
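    # Worked example of the two normalization paths above, with made-up numbers:
    #
    #     # pivot-based (bounds available): close 1510, price_min 1000, price_max 2000
    #     (1510 - 1000) / (2000 - 1000)        # -> 0.51
    #
    #     # traditional fallback (no bounds): divide by the latest close, e.g. 1500
    #     1510 / 1500                          # -> ~1.007
    #
    # RSI/stochastic style indicators are simply divided by 100, and Williams %R
    # is shifted from [-100, 0] into [0, 1] as (value + 100) / 100.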
|
|
|
|
def get_multi_symbol_feature_matrix(self, symbols: List[str] = None,
|
|
timeframes: List[str] = None,
|
|
window_size: int = 20) -> Optional[np.ndarray]:
|
|
"""
|
|
Get feature matrix for multiple symbols and timeframes
|
|
|
|
Returns:
|
|
np.ndarray: Shape (n_symbols, n_timeframes, window_size, n_features)
|
|
"""
|
|
try:
|
|
if symbols is None:
|
|
symbols = self.symbols
|
|
if timeframes is None:
|
|
timeframes = self.timeframes
|
|
|
|
symbol_matrices = []
|
|
|
|
for symbol in symbols:
|
|
symbol_matrix = self.get_feature_matrix(symbol, timeframes, window_size)
|
|
if symbol_matrix is not None:
|
|
symbol_matrices.append(symbol_matrix)
|
|
else:
|
|
logger.warning(f"Could not create feature matrix for {symbol}")
|
|
|
|
if symbol_matrices:
|
|
# Stack all symbol matrices
|
|
multi_symbol_matrix = np.stack(symbol_matrices, axis=0)
|
|
logger.info(f"Created multi-symbol feature matrix: {multi_symbol_matrix.shape}")
|
|
return multi_symbol_matrix
|
|
|
|
return None
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error creating multi-symbol feature matrix: {e}")
|
|
return None
|
|
|
|
def health_check(self) -> Dict[str, Any]:
|
|
"""Get health status of the data provider"""
|
|
status = {
|
|
'streaming': self.is_streaming,
|
|
'data_maintenance_active': self.data_maintenance_active,
|
|
'symbols': len(self.symbols),
|
|
'timeframes': len(self.timeframes),
|
|
'current_prices': len(self.current_prices),
|
|
'websocket_tasks': len(self.websocket_tasks),
|
|
'cached_data_loaded': {}
|
|
}
|
|
|
|
# Check cached data availability
|
|
for symbol in self.symbols:
|
|
status['cached_data_loaded'][symbol] = {}
|
|
for tf in self.timeframes:
|
|
has_data = (symbol in self.cached_data and
|
|
tf in self.cached_data[symbol] and
|
|
not self.cached_data[symbol][tf].empty)
|
|
candle_count = len(self.cached_data[symbol][tf]) if has_data else 0
|
|
status['cached_data_loaded'][symbol][tf] = {
|
|
'has_data': has_data,
|
|
'candle_count': candle_count
|
|
}
|
|
|
|
return status
|
|
|
|
def get_cached_data_summary(self) -> Dict[str, Any]:
|
|
"""Get summary of cached data"""
|
|
summary = {
|
|
'symbols': self.symbols,
|
|
'timeframes': self.timeframes,
|
|
'data_maintenance_active': self.data_maintenance_active,
|
|
'cached_data': {}
|
|
}
|
|
|
|
for symbol in self.symbols:
|
|
summary['cached_data'][symbol] = {}
|
|
for timeframe in self.timeframes:
|
|
if symbol in self.cached_data and timeframe in self.cached_data[symbol]:
|
|
df = self.cached_data[symbol][timeframe]
|
|
if not df.empty:
|
|
summary['cached_data'][symbol][timeframe] = {
|
|
'candle_count': len(df),
|
|
'start_time': df.index[0].isoformat() if hasattr(df.index[0], 'isoformat') else str(df.index[0]),
|
|
'end_time': df.index[-1].isoformat() if hasattr(df.index[-1], 'isoformat') else str(df.index[-1]),
|
|
'latest_price': float(df.iloc[-1]['close'])
|
|
}
|
|
else:
|
|
summary['cached_data'][symbol][timeframe] = {
|
|
'candle_count': 0,
|
|
'status': 'empty'
|
|
}
|
|
else:
|
|
summary['cached_data'][symbol][timeframe] = {
|
|
'candle_count': 0,
|
|
'status': 'not_initialized'
|
|
}
|
|
|
|
return summary
|
|
|
|
    def get_cob_data_quality(self) -> Dict[str, Any]:
        """Get COB data quality information"""
        quality_info = {
            'symbols': self.symbols,
            'raw_ticks': {},
            'aggregated_1s': {},
            'imbalance_indicators': {},
            'data_freshness': {}
        }

        try:
            current_time = time.time()

            for symbol in self.symbols:
                # Raw ticks info
                raw_ticks = list(self.cob_raw_ticks[symbol])
                if raw_ticks:
                    latest_tick = raw_ticks[-1]
                    latest_timestamp = latest_tick['timestamp']
                    if isinstance(latest_timestamp, datetime):
                        age_seconds = current_time - latest_timestamp.timestamp()
                    else:
                        age_seconds = current_time - float(latest_timestamp)
                else:
                    age_seconds = None

                quality_info['raw_ticks'][symbol] = {
                    'count': len(raw_ticks),
                    'latest_timestamp': raw_ticks[-1]['timestamp'] if raw_ticks else None,
                    'age_seconds': age_seconds
                }

                # Aggregated 1s data info
                aggregated_data = list(self.cob_1s_aggregated[symbol])
                quality_info['aggregated_1s'][symbol] = {
                    'count': len(aggregated_data),
                    'latest_timestamp': aggregated_data[-1]['timestamp'] if aggregated_data else None,
                    'age_seconds': current_time - aggregated_data[-1]['timestamp'] if aggregated_data else None
                }

                # Imbalance indicators info
                if aggregated_data:
                    latest_data = aggregated_data[-1]
                    quality_info['imbalance_indicators'][symbol] = {
                        'imbalance_1s': latest_data.get('imbalance_1s', 0),
                        'imbalance_5s': latest_data.get('imbalance_5s', 0),
                        'imbalance_15s': latest_data.get('imbalance_15s', 0),
                        'imbalance_60s': latest_data.get('imbalance_60s', 0),
                        'total_volume': latest_data.get('total_volume', 0),
                        'bucket_count': len(latest_data.get('bid_buckets', {})) + len(latest_data.get('ask_buckets', {}))
                    }

                # Data freshness assessment
                raw_age = quality_info['raw_ticks'][symbol]['age_seconds']
                agg_age = quality_info['aggregated_1s'][symbol]['age_seconds']

                if raw_age is not None and agg_age is not None:
                    if raw_age < 5 and agg_age < 5:
                        freshness = 'excellent'
                    elif raw_age < 15 and agg_age < 15:
                        freshness = 'good'
                    elif raw_age < 60 and agg_age < 60:
                        freshness = 'fair'
                    else:
                        freshness = 'stale'
                else:
                    freshness = 'no_data'

                quality_info['data_freshness'][symbol] = freshness

        except Exception as e:
            logger.error(f"Error getting COB data quality: {e}")
            quality_info['error'] = str(e)

        return quality_info

    def subscribe_to_ticks(self, callback: Callable[[MarketTick], None],
                           symbols: List[str] = None,
                           subscriber_name: str = None) -> str:
        """Subscribe to real-time tick data updates"""
        subscriber_id = str(uuid.uuid4())[:8]
        subscriber_name = subscriber_name or f"subscriber_{subscriber_id}"

        # Convert symbols to Binance format
        if symbols:
            binance_symbols = [s.replace('/', '').upper() for s in symbols]
        else:
            binance_symbols = [s.replace('/', '').upper() for s in self.symbols]

        subscriber = DataSubscriber(
            subscriber_id=subscriber_id,
            callback=callback,
            symbols=binance_symbols
        )
        # DataSubscriber does not declare a subscriber_name field, so attach it as
        # an extra attribute for logging and stats reporting
        subscriber.subscriber_name = subscriber_name

        with self.subscriber_lock:
            self.subscribers[subscriber_id] = subscriber

        logger.info(f"New tick subscriber registered: {subscriber_name} ({subscriber_id}) for symbols: {binance_symbols}")

        # Send recent tick data to the new subscriber
        self._send_recent_ticks_to_subscriber(subscriber)

        return subscriber_id

    def unsubscribe_from_ticks(self, subscriber_id: str):
        """Unsubscribe from tick data updates"""
        with self.subscriber_lock:
            if subscriber_id in self.subscribers:
                subscriber_name = getattr(self.subscribers[subscriber_id], 'subscriber_name', subscriber_id)
                self.subscribers[subscriber_id].active = False
                del self.subscribers[subscriber_id]
                logger.info(f"Subscriber {subscriber_name} ({subscriber_id}) unsubscribed")

    def _send_recent_ticks_to_subscriber(self, subscriber: DataSubscriber):
        """Send recent tick data to a new subscriber"""
        try:
            for symbol in subscriber.symbols:
                if symbol in self.tick_buffers:
                    # Send the last 50 ticks to get the subscriber up to speed
                    recent_ticks = list(self.tick_buffers[symbol])[-50:]
                    for tick in recent_ticks:
                        try:
                            subscriber.callback(tick)
                        except Exception as e:
                            logger.warning(f"Error sending recent tick to subscriber {subscriber.subscriber_id}: {e}")
        except Exception as e:
            logger.error(f"Error sending recent ticks: {e}")

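    # Usage sketch (hypothetical caller; assumes a constructed DataProvider instance
    # named `data_provider` with streaming already started elsewhere):
    #
    #     def on_tick(tick: MarketTick) -> None:
    #         print(f"{tick.symbol} {tick.price} {tick.side}")
    #
    #     sub_id = data_provider.subscribe_to_ticks(on_tick, symbols=['ETH/USDT'],
    #                                               subscriber_name='console_logger')
    #     ...
    #     data_provider.unsubscribe_from_ticks(sub_id)
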
    def _distribute_tick(self, tick: MarketTick):
        """Distribute tick to all relevant subscribers"""
        distributed_count = 0

        with self.subscriber_lock:
            subscribers_to_remove = []

            for subscriber_id, subscriber in self.subscribers.items():
                if not subscriber.active:
                    subscribers_to_remove.append(subscriber_id)
                    continue

                if tick.symbol in subscriber.symbols:
                    try:
                        # Call the subscriber callback in a thread to avoid blocking.
                        # Bind the loop variables as defaults so each thread sees the
                        # subscriber it was created for, not the last one in the loop.
                        def call_callback(subscriber=subscriber, subscriber_id=subscriber_id):
                            try:
                                subscriber.callback(tick)
                                subscriber.tick_count += 1
                                subscriber.last_update = datetime.now()
                            except Exception as e:
                                logger.warning(f"Error in subscriber {subscriber_id} callback: {e}")
                                subscriber.active = False

                        # Use a thread to avoid blocking the main data processing
                        Thread(target=call_callback, daemon=True).start()
                        distributed_count += 1

                    except Exception as e:
                        logger.warning(f"Error distributing tick to subscriber {subscriber_id}: {e}")
                        subscriber.active = False

            # Remove inactive subscribers
            for subscriber_id in subscribers_to_remove:
                if subscriber_id in self.subscribers:
                    del self.subscribers[subscriber_id]

        self.distribution_stats['total_ticks_distributed'] += distributed_count

    def _validate_tick_data(self, symbol: str, price: float, volume: float) -> bool:
        """Validate incoming tick data for quality"""
        try:
            # Basic validation
            if price <= 0 or volume < 0:
                return False

            # Price change validation
            last_price = self.last_prices.get(symbol, 0)
            if last_price > 0:
                price_change_pct = abs(price - last_price) / last_price
                if price_change_pct > self.price_change_threshold:
                    logger.warning(f"Large price change for {symbol}: {price_change_pct:.2%}")
                    # Don't reject, just warn - could be legitimate

            return True

        except Exception as e:
            logger.error(f"Error validating tick data: {e}")
            return False

    def get_recent_ticks(self, symbol: str, count: int = 100) -> List[MarketTick]:
        """Get recent ticks for a symbol"""
        binance_symbol = symbol.replace('/', '').upper()
        if binance_symbol in self.tick_buffers:
            return list(self.tick_buffers[binance_symbol])[-count:]
        return []

    def subscribe_to_raw_ticks(self, callback: Callable[[RawTick], None]) -> str:
        """Subscribe to raw tick data with timing information"""
        subscriber_id = str(uuid.uuid4())
        self.raw_tick_callbacks.append(callback)
        logger.info(f"Raw tick subscriber added: {subscriber_id}")
        return subscriber_id

    def subscribe_to_ohlcv_bars(self, callback: Callable[[OHLCVBar], None]) -> str:
        """Subscribe to 1s OHLCV bars calculated from ticks"""
        subscriber_id = str(uuid.uuid4())
        self.ohlcv_bar_callbacks.append(callback)
        logger.info(f"OHLCV bar subscriber added: {subscriber_id}")
        return subscriber_id

    def get_raw_tick_features(self, symbol: str, window_size: int = 50) -> Optional[np.ndarray]:
        """Get raw tick features for model consumption"""
        return self.tick_aggregator.get_tick_features_for_model(symbol, window_size)

    def get_ohlcv_features(self, symbol: str, window_size: int = 60) -> Optional[np.ndarray]:
        """Get 1s OHLCV features for model consumption"""
        return self.tick_aggregator.get_ohlcv_features_for_model(symbol, window_size)

    def get_detected_patterns(self, symbol: str, count: int = 50) -> List:
        """Get recently detected tick patterns"""
        return self.tick_aggregator.get_detected_patterns(symbol, count)

    def get_tick_aggregator_stats(self) -> Dict[str, Any]:
        """Get tick aggregator statistics"""
        return self.tick_aggregator.get_statistics()

    def get_subscriber_stats(self) -> Dict[str, Any]:
        """Get subscriber and distribution statistics"""
        with self.subscriber_lock:
            active_subscribers = len([s for s in self.subscribers.values() if s.active])
            subscriber_stats = {
                sid: {
                    'name': getattr(s, 'subscriber_name', sid),
                    'active': s.active,
                    'symbols': s.symbols,
                    'tick_count': s.tick_count,
                    'last_update': s.last_update.isoformat() if s.last_update else None
                }
                for sid, s in self.subscribers.items()
            }

        # Get tick aggregator stats
        aggregator_stats = self.get_tick_aggregator_stats()

        return {
            'active_subscribers': active_subscribers,
            'total_subscribers': len(self.subscribers),
            'raw_tick_callbacks': len(self.raw_tick_callbacks),
            'ohlcv_bar_callbacks': len(self.ohlcv_bar_callbacks),
            'subscriber_details': subscriber_stats,
            'distribution_stats': self.distribution_stats.copy(),
            'buffer_sizes': {symbol: len(buffer) for symbol, buffer in self.tick_buffers.items()},
            'tick_aggregator': aggregator_stats
        }

    def update_bom_cache(self, symbol: str, bom_features: List[float], cob_integration=None):
        """
        Update BOM cache with latest features for a symbol

        Args:
            symbol: Trading symbol (e.g., 'ETH/USDT')
            bom_features: List of BOM features (should be 120 features)
            cob_integration: Optional COB integration instance for real BOM data
        """
        try:
            current_time = datetime.now()

            # Ensure we have exactly 120 features
            if len(bom_features) != self.bom_feature_count:
                if len(bom_features) > self.bom_feature_count:
                    bom_features = bom_features[:self.bom_feature_count]
                else:
                    bom_features.extend([0.0] * (self.bom_feature_count - len(bom_features)))

            # Convert to numpy array for efficient storage
            bom_array = np.array(bom_features, dtype=np.float32)

            # Add timestamp and features to the cache
            with self.data_lock:
                self.bom_data_cache[symbol].append((current_time, bom_array))

            logger.debug(f"Updated BOM cache for {symbol}: {len(self.bom_data_cache[symbol])} timestamps cached")

        except Exception as e:
            logger.error(f"Error updating BOM cache for {symbol}: {e}")

    def get_bom_matrix_for_cnn(self, symbol: str, sequence_length: int = 50) -> Optional[np.ndarray]:
        """
        Get BOM matrix for CNN input from cached 1s data

        Args:
            symbol: Trading symbol (e.g., 'ETH/USDT')
            sequence_length: Required sequence length (default 50)

        Returns:
            np.ndarray: BOM matrix of shape (sequence_length, 120) or None if insufficient data
        """
        try:
            with self.data_lock:
                if symbol not in self.bom_data_cache or len(self.bom_data_cache[symbol]) == 0:
                    logger.warning(f"No BOM data cached for {symbol}")
                    return None

                # Get recent data
                cached_data = list(self.bom_data_cache[symbol])

                if len(cached_data) < sequence_length:
                    logger.warning(f"Insufficient BOM data for {symbol}: {len(cached_data)} < {sequence_length}")
                    # Pad with zeros if we don't have enough data
                    bom_matrix = np.zeros((sequence_length, self.bom_feature_count), dtype=np.float32)

                    # Fill available data at the end of the window
                    for i, (timestamp, features) in enumerate(cached_data):
                        if i < sequence_length:
                            bom_matrix[sequence_length - len(cached_data) + i] = features

                    return bom_matrix

                # Take the most recent sequence_length samples
                recent_data = cached_data[-sequence_length:]

                # Create matrix
                bom_matrix = np.zeros((sequence_length, self.bom_feature_count), dtype=np.float32)
                for i, (timestamp, features) in enumerate(recent_data):
                    bom_matrix[i] = features

                logger.debug(f"Retrieved BOM matrix for {symbol}: shape={bom_matrix.shape}")
                return bom_matrix

        except Exception as e:
            logger.error(f"Error getting BOM matrix for {symbol}: {e}")
            return None

    def get_real_bom_features(self, symbol: str) -> Optional[List[float]]:
        """
        Get REAL BOM features from actual market data ONLY

        NO SYNTHETIC DATA - Returns None if real data is not available
        """
        try:
            # Try to get real COB data from the integration
            if hasattr(self, 'cob_integration') and self.cob_integration:
                return self._extract_real_bom_features(symbol, self.cob_integration)

            # No real data available - return None instead of synthetic
            logger.warning(f"No real BOM data available for {symbol} - waiting for real market data")
            return None

        except Exception as e:
            logger.error(f"Error getting real BOM features for {symbol}: {e}")
            return None

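    # Usage sketch (hypothetical; assumes BOM features are being cached via
    # update_bom_cache / start_bom_cache_updates on a DataProvider named `data_provider`):
    #
    #     bom_matrix = data_provider.get_bom_matrix_for_cnn('ETH/USDT', sequence_length=50)
    #     if bom_matrix is not None:
    #         assert bom_matrix.shape == (50, 120)  # (sequence_length, bom_feature_count)
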
    def start_bom_cache_updates(self, cob_integration=None):
        """
        Start background updates of the BOM cache every second

        Args:
            cob_integration: Optional COB integration instance for real data
        """
        try:
            def update_loop():
                while self.is_streaming:
                    try:
                        for symbol in self.symbols:
                            if cob_integration:
                                # Try to get real BOM features from the COB integration
                                try:
                                    bom_features = self._extract_real_bom_features(symbol, cob_integration)
                                    if bom_features:
                                        self.update_bom_cache(symbol, bom_features, cob_integration)
                                    else:
                                        # NO SYNTHETIC FALLBACK - wait for real data
                                        logger.warning(f"No real BOM features available for {symbol} - waiting for real data")
                                except Exception as e:
                                    logger.warning(f"Error getting real BOM features for {symbol}: {e}")
                                    logger.warning("Waiting for real data instead of using synthetic")
                            else:
                                # NO SYNTHETIC FEATURES - wait for real COB integration
                                logger.warning(f"No COB integration available for {symbol} - waiting for real data")

                        time.sleep(1.0)  # Update every second

                    except Exception as e:
                        logger.error(f"Error in BOM cache update loop: {e}")
                        time.sleep(5.0)  # Wait longer on error

            # Start background thread
            bom_thread = Thread(target=update_loop, daemon=True)
            bom_thread.start()

            logger.info("Started BOM cache updates (1s resolution)")

        except Exception as e:
            logger.error(f"Error starting BOM cache updates: {e}")

    def _extract_real_bom_features(self, symbol: str, cob_integration) -> Optional[List[float]]:
        """Extract real BOM features from COB integration"""
        try:
            features = []

            # Consolidated order book features (40 features)
            if hasattr(cob_integration, 'get_consolidated_orderbook'):
                cob_snapshot = cob_integration.get_consolidated_orderbook(symbol)
                if cob_snapshot:
                    features.extend(self._extract_orderbook_features(cob_snapshot))
                else:
                    features.extend([0.0] * 40)
            else:
                features.extend([0.0] * 40)

            # Volume profile features (30 features)
            if hasattr(cob_integration, 'get_session_volume_profile'):
                volume_profile = cob_integration.get_session_volume_profile(symbol)
                if volume_profile:
                    features.extend(self._extract_volume_profile_features(volume_profile))
                else:
                    features.extend([0.0] * 30)
            else:
                features.extend([0.0] * 30)

            # Flow and microstructure features (50 features)
            features.extend(self._extract_flow_microstructure_features(symbol, cob_integration))

            # Ensure exactly 120 features
            if len(features) > 120:
                features = features[:120]
            elif len(features) < 120:
                features.extend([0.0] * (120 - len(features)))

            return features

        except Exception as e:
            logger.warning(f"Error extracting real BOM features for {symbol}: {e}")
            return None

    def _extract_orderbook_features(self, cob_snapshot) -> List[float]:
        """Extract order book features from a COB snapshot"""
        features = []

        try:
            # Top 10 bid levels
            for i in range(10):
                if i < len(cob_snapshot.consolidated_bids):
                    level = cob_snapshot.consolidated_bids[i]
                    price_offset = (level.price - cob_snapshot.volume_weighted_mid) / cob_snapshot.volume_weighted_mid
                    volume_normalized = level.total_volume_usd / 1000000
                    features.extend([price_offset, volume_normalized])
                else:
                    features.extend([0.0, 0.0])

            # Top 10 ask levels
            for i in range(10):
                if i < len(cob_snapshot.consolidated_asks):
                    level = cob_snapshot.consolidated_asks[i]
                    price_offset = (level.price - cob_snapshot.volume_weighted_mid) / cob_snapshot.volume_weighted_mid
                    volume_normalized = level.total_volume_usd / 1000000
                    features.extend([price_offset, volume_normalized])
                else:
                    features.extend([0.0, 0.0])

        except Exception as e:
            logger.warning(f"Error extracting order book features: {e}")
            features = [0.0] * 40

        return features[:40]

    def _extract_volume_profile_features(self, volume_profile) -> List[float]:
        """Extract volume profile features"""
        features = []

        try:
            if 'data' in volume_profile:
                svp_data = volume_profile['data']
                top_levels = sorted(svp_data, key=lambda x: x.get('total_volume', 0), reverse=True)[:10]

                for level in top_levels:
                    buy_percent = level.get('buy_percent', 50.0) / 100.0
                    sell_percent = level.get('sell_percent', 50.0) / 100.0
                    total_volume = level.get('total_volume', 0.0) / 1000000
                    features.extend([buy_percent, sell_percent, total_volume])

            # Pad to 30 features
            while len(features) < 30:
                features.extend([0.5, 0.5, 0.0])

        except Exception as e:
            logger.warning(f"Error extracting volume profile features: {e}")
            features = [0.0] * 30

        return features[:30]

    def _extract_flow_microstructure_features(self, symbol: str, cob_integration) -> List[float]:
        """Extract flow and microstructure features (50 features)"""
        try:
            # Real flow/microstructure extraction is not implemented yet. Return a
            # zero-filled placeholder so callers that extend a feature list with this
            # result do not fail; NO SYNTHETIC DATA is generated.
            logger.warning(f"No real microstructure data available for {symbol} - returning zero placeholder")
            return [0.0] * 50
        except Exception:
            return [0.0] * 50

    def _handle_rate_limit(self, url: str):
        """Handle rate limiting by spacing out requests to the same endpoint"""
        current_time = time.time()

        # Check if we need to wait
        if url in self.last_request_time:
            time_since_last = current_time - self.last_request_time[url]
            if time_since_last < self.request_interval:
                sleep_time = self.request_interval - time_since_last
                logger.info(f"Rate limiting: sleeping {sleep_time:.2f}s")
                time.sleep(sleep_time)

        self.last_request_time[url] = time.time()

    def _make_request_with_retry(self, url: str, params: dict = None):
        """Make HTTP request with retry logic and exponential backoff for 451 errors"""
        for attempt in range(self.max_retries):
            try:
                self._handle_rate_limit(url)
                response = requests.get(url, params=params, timeout=30)

                if response.status_code == 451:
                    logger.warning(f"Rate limit hit (451), attempt {attempt + 1}/{self.max_retries}")
                    if attempt < self.max_retries - 1:
                        sleep_time = self.retry_delay * (2 ** attempt)  # Exponential backoff
                        logger.info(f"Waiting {sleep_time}s before retry...")
                        time.sleep(sleep_time)
                        continue
                    else:
                        logger.error("Max retries reached, using cached data")
                        return None

                response.raise_for_status()
                return response

            except Exception as e:
                logger.error(f"Request failed (attempt {attempt + 1}): {e}")
                if attempt < self.max_retries - 1:
                    time.sleep(5 * (attempt + 1))

        return None
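    # Backoff sketch: with retry_delay = 60 (an assumed config value), the HTTP 451
    # retry waits grow as 60s, 120s, 240s, ... i.e. retry_delay * (2 ** attempt), while
    # transient non-451 failures wait 5s, 10s, 15s, ... i.e. 5 * (attempt + 1).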

    # === SIMPLIFIED TRAINING DATA COLLECTION ===

    def start_training_data_collection(self):
        """Start simplified training data collection"""
        if hasattr(self, 'training_data_collection_active') and self.training_data_collection_active:
            logger.warning("Training data collection already active")
            return

        self.training_data_collection_active = True
        self.training_data_thread = Thread(target=self._training_data_collection_worker, daemon=True)
        self.training_data_thread.start()
        logger.info("Training data collection started")

    def stop_training_data_collection(self):
        """Stop training data collection"""
        if hasattr(self, 'training_data_collection_active'):
            self.training_data_collection_active = False
            if hasattr(self, 'training_data_thread') and self.training_data_thread and self.training_data_thread.is_alive():
                self.training_data_thread.join(timeout=5)
            logger.info("Training data collection stopped")

    def _training_data_collection_worker(self):
        """Simplified training data collection worker"""
        logger.info("Training data collection worker started")

        while getattr(self, 'training_data_collection_active', False):
            try:
                # Collect training data for all symbols
                for symbol in self.symbols:
                    training_sample = self._collect_training_sample(symbol)
                    if training_sample:
                        binance_symbol = symbol.replace('/', '').upper()
                        self.training_data_cache[binance_symbol].append(training_sample)

                        # Distribute to training data subscribers
                        for callback in self.training_data_callbacks:
                            try:
                                callback(symbol, training_sample)
                            except Exception as e:
                                logger.error(f"Error in training data callback: {e}")

                # Sleep for 10 seconds between collections
                time.sleep(10)

            except Exception as e:
                logger.error(f"Error in training data collection worker: {e}")
                time.sleep(30)  # Wait longer on error

    def _collect_training_sample(self, symbol: str) -> Optional[dict]:
        """Collect a simplified training sample"""
        try:
            # Get recent OHLCV data from the cache
            ohlcv_data = self.get_historical_data(symbol, '1m', limit=50)
            if ohlcv_data is None or len(ohlcv_data) < 10:
                return None

            # Get recent COB data
            recent_cob = self.get_cob_1s_aggregated(symbol, count=10)

            # Create simplified training sample
            training_sample = {
                'symbol': symbol,
                'timestamp': datetime.now(),
                'ohlcv_data': ohlcv_data.tail(10).to_dict('records') if not ohlcv_data.empty else [],
                'cob_data': recent_cob,
                'features': self._extract_simple_training_features(symbol, ohlcv_data, recent_cob)
            }

            return training_sample

        except Exception as e:
            logger.error(f"Error collecting training sample for {symbol}: {e}")
            return None

    def _extract_simple_training_features(self, symbol: str, ohlcv_data: pd.DataFrame, recent_cob: List[dict]) -> dict:
        """Extract simplified training features"""
        try:
            features = {}

            # OHLCV features
            if not ohlcv_data.empty:
                latest = ohlcv_data.iloc[-1]
                features.update({
                    'price': latest['close'],
                    'volume': latest['volume'],
                    'price_change': (latest['close'] - ohlcv_data.iloc[-2]['close']) / ohlcv_data.iloc[-2]['close'] if len(ohlcv_data) > 1 else 0,
                    'volatility': ohlcv_data['close'].pct_change().std() if len(ohlcv_data) > 1 else 0
                })

            # COB features
            if recent_cob:
                latest_cob = recent_cob[-1]
                stats = latest_cob.get('stats', {})
                features.update({
                    'avg_spread_bps': stats.get('avg_spread_bps', 0),
                    'avg_imbalance': stats.get('avg_imbalance', 0),
                    'total_volume': stats.get('total_volume', 0),
                    'bucket_count': stats.get('bid_bucket_count', 0) + stats.get('ask_bucket_count', 0)
                })

            return features

        except Exception as e:
            logger.error(f"Error extracting simple training features for {symbol}: {e}")
            return {}

    # === SUBSCRIPTION METHODS ===

    def subscribe_to_training_data(self, callback: Callable[[str, dict], None]) -> str:
        """Subscribe to training data updates"""
        subscriber_id = str(uuid.uuid4())
        self.training_data_callbacks.append(callback)
        logger.info(f"Training data subscriber added: {subscriber_id}")
        return subscriber_id

    def subscribe_to_model_predictions(self, callback: Callable[[str, dict], None]) -> str:
        """Subscribe to model prediction updates"""
        subscriber_id = str(uuid.uuid4())
        self.model_prediction_callbacks.append(callback)
        logger.info(f"Model prediction subscriber added: {subscriber_id}")
        return subscriber_id

    def get_training_data(self, symbol: str, count: int = 100) -> List[dict]:
        """Get recent training data for a symbol"""
        binance_symbol = symbol.replace('/', '').upper()
        if binance_symbol in self.training_data_cache:
            return list(self.training_data_cache[binance_symbol])[-count:]
        return []

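    # Usage sketch (hypothetical consumer of the simplified training pipeline):
    #
    #     def on_training_sample(symbol: str, sample: dict) -> None:
    #         print(symbol, sample['features'].get('price'))
    #
    #     data_provider.start_training_data_collection()
    #     data_provider.subscribe_to_training_data(on_training_sample)
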
    def collect_cob_data(self, symbol: str) -> dict:
        """
        Collect Consolidated Order Book (COB) data for a symbol using the REST API.

        This centralized method collects COB data for all consumers (models, dashboard, etc.)
        """
        try:
            # Use the Binance REST API for order book data with a reduced limit
            binance_symbol = symbol.replace('/', '')
            url = f"https://api.binance.com/api/v3/depth?symbol={binance_symbol}&limit=100"  # Reduced from 500

            # Respect the per-endpoint request interval (sleeps if needed)
            self._handle_rate_limit("https://api.binance.com/api/v3/depth")

            # Add headers to reduce detection
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'application/json'
            }

            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                data = response.json()

                # Process order book data
                bids = [[float(price), float(qty)] for price, qty in data.get('bids', [])]
                asks = [[float(price), float(qty)] for price, qty in data.get('asks', [])]

                # Calculate mid price
                best_bid = bids[0][0] if bids else 0
                best_ask = asks[0][0] if asks else 0
                mid_price = (best_bid + best_ask) / 2 if best_bid and best_ask else 0

                # Calculate order book stats
                bid_liquidity = sum(qty for _, qty in bids[:20])
                ask_liquidity = sum(qty for _, qty in asks[:20])
                total_liquidity = bid_liquidity + ask_liquidity

                # Calculate imbalance
                imbalance = (bid_liquidity - ask_liquidity) / total_liquidity if total_liquidity > 0 else 0

                # Calculate spread in basis points
                spread = (best_ask - best_bid) / mid_price * 10000 if mid_price > 0 else 0

                # Create COB snapshot
                cob_snapshot = {
                    'symbol': symbol,
                    'timestamp': int(time.time() * 1000),
                    'bids': bids[:50],  # Limit to top 50 levels
                    'asks': asks[:50],  # Limit to top 50 levels
                    'stats': {
                        'mid_price': mid_price,
                        'best_bid': best_bid,
                        'best_ask': best_ask,
                        'bid_liquidity': bid_liquidity,
                        'ask_liquidity': ask_liquidity,
                        'total_liquidity': total_liquidity,
                        'imbalance': imbalance,
                        'spread_bps': spread
                    }
                }

                # Store in cache
                with self.subscriber_lock:
                    if not hasattr(self, 'cob_data_cache'):
                        self.cob_data_cache = {}

                    if symbol not in self.cob_data_cache:
                        self.cob_data_cache[symbol] = []

                    # Add to the cache with a max size limit
                    self.cob_data_cache[symbol].append(cob_snapshot)
                    if len(self.cob_data_cache[symbol]) > 300:  # Keep 5 minutes of 1s data
                        self.cob_data_cache[symbol].pop(0)

                # Notify subscribers
                self._notify_cob_subscribers(symbol, cob_snapshot)

                return cob_snapshot
            elif response.status_code in [418, 429, 451]:
                logger.warning(f"Rate limited (HTTP {response.status_code}) for {symbol}, using cached data")
                # Return cached data if available (cache is keyed by the original symbol format)
                if hasattr(self, 'cob_data_cache') and self.cob_data_cache.get(symbol):
                    return self.cob_data_cache[symbol][-1]
                return {}
            else:
                logger.warning(f"Failed to fetch COB data for {symbol}: {response.status_code}")
                return {}

        except Exception as e:
            logger.debug(f"Error collecting COB data for {symbol}: {e}")
            return {}

    def start_cob_collection(self):
        """
        Start enhanced COB data collection with WebSocket and raw tick aggregation
        """
        try:
            # Initialize COB WebSocket system
            self._initialize_enhanced_cob_websocket()

            # Start aggregation system
            self._start_cob_tick_aggregation()

            logger.info("Enhanced COB data collection started with WebSocket and tick aggregation")

        except Exception as e:
            logger.error(f"Error starting enhanced COB collection: {e}")
            # Fallback to REST-only collection
            self._start_rest_only_cob_collection()

    def _initialize_enhanced_cob_websocket(self):
        """Initialize the enhanced COB WebSocket system"""
        try:
            from .enhanced_cob_websocket import EnhancedCOBWebSocket

            # Initialize WebSocket with our symbols
            self.enhanced_cob_websocket = EnhancedCOBWebSocket(
                symbols=['ETH/USDT', 'BTC/USDT'],
                dashboard_callback=self._on_cob_websocket_status
            )

            # Add callback for COB data
            self.enhanced_cob_websocket.add_cob_callback(self._on_cob_websocket_data)

            # Start WebSocket in a background thread
            import threading

            def run_websocket():
                """Run WebSocket in a separate thread with its own event loop"""
                try:
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                    loop.run_until_complete(self.enhanced_cob_websocket.start())
                    loop.run_forever()
                except Exception as e:
                    logger.error(f"Error in COB WebSocket thread: {e}")

            websocket_thread = threading.Thread(target=run_websocket, daemon=True)
            websocket_thread.start()

            logger.info("Enhanced COB WebSocket initialized and started")

        except ImportError:
            logger.warning("Enhanced COB WebSocket not available, falling back to REST")
            self._start_rest_only_cob_collection()
        except Exception as e:
            logger.error(f"Error initializing COB WebSocket: {e}")
            self._start_rest_only_cob_collection()

    def _start_cob_tick_aggregation(self):
        """Start COB tick aggregation system"""
        try:
            # Initialize tick storage (deques so old entries are dropped automatically,
            # matching the WebSocket handler's storage format)
            if not hasattr(self, 'cob_raw_ticks'):
                self.cob_raw_ticks = {
                    'ETH/USDT': deque(maxlen=90000),
                    'BTC/USDT': deque(maxlen=90000)
                }

            if not hasattr(self, 'cob_1s_aggregated'):
                self.cob_1s_aggregated = {
                    'ETH/USDT': deque(maxlen=900),
                    'BTC/USDT': deque(maxlen=900)
                }

            # Start aggregation thread
            import threading

            def tick_aggregator():
                """Aggregate raw ticks into 1-second intervals"""
                logger.info("Starting COB tick aggregation system")

                while True:
                    try:
                        current_time = time.time()
                        current_second = int(current_time)

                        # Process each symbol
                        for symbol in ['ETH/USDT', 'BTC/USDT']:
                            self._aggregate_ticks_for_symbol(symbol, current_second)

                        # Sleep until the next second boundary
                        sleep_time = 1.0 - (current_time % 1.0)
                        time.sleep(sleep_time)

                    except Exception as e:
                        logger.error(f"Error in tick aggregation: {e}")
                        time.sleep(1)

            aggregation_thread = threading.Thread(target=tick_aggregator, daemon=True)
            aggregation_thread.start()

            logger.info("COB tick aggregation system started")

        except Exception as e:
            logger.error(f"Error starting tick aggregation: {e}")

    def _start_rest_only_cob_collection(self):
        """Fallback to REST-only COB collection"""
        try:
            import threading

            def cob_collector():
                """Collect COB data using REST API calls"""
                logger.info("Starting REST-only COB data collection")
                while True:
                    try:
                        # Collect data for both symbols
                        for symbol in ['ETH/USDT', 'BTC/USDT']:
                            self.collect_cob_data(symbol)

                        # Sleep for 1 second between collections
                        time.sleep(1)
                    except Exception as e:
                        logger.debug(f"Error in COB collection: {e}")
                        time.sleep(5)  # Wait longer on error

            # Start collector in a background thread
            if not hasattr(self, '_cob_thread_started') or not self._cob_thread_started:
                cob_thread = threading.Thread(target=cob_collector, daemon=True)
                cob_thread.start()
                self._cob_thread_started = True
                logger.info("REST-only COB data collection started")

        except Exception as e:
            logger.error(f"Error starting REST-only COB collection: {e}")

    async def _on_cob_websocket_data(self, symbol: str, cob_data: dict):
        """Handle COB data from WebSocket (100+ updates per second)"""
        try:
            # Add timestamp if not present
            if 'timestamp' not in cob_data:
                cob_data['timestamp'] = time.time()
            elif hasattr(cob_data['timestamp'], 'timestamp'):
                # Convert datetime to an epoch timestamp
                cob_data['timestamp'] = cob_data['timestamp'].timestamp()

            # Store raw tick - ensure proper initialization
            if not hasattr(self, 'cob_raw_ticks'):
                self.cob_raw_ticks = {}
            if not hasattr(self, 'cob_1s_aggregated'):
                self.cob_1s_aggregated = {}

            # Ensure symbol keys exist with proper deque initialization
            for sym in ['ETH/USDT', 'BTC/USDT']:
                if sym not in self.cob_raw_ticks:
                    # deque with maxlen for automatic size management (15 min at ~100 ticks/sec)
                    self.cob_raw_ticks[sym] = deque(maxlen=90000)
                if sym not in self.cob_1s_aggregated:
                    # 1s aggregated: 15 minutes = 900 seconds
                    self.cob_1s_aggregated[sym] = deque(maxlen=900)

            # Add to raw ticks - deque automatically handles the size limit via maxlen
            self.cob_raw_ticks[symbol].append(cob_data)

            # Update the latest data cache for immediate access
            with self.subscriber_lock:
                if not hasattr(self, 'cob_data_cache'):
                    self.cob_data_cache = {}

                # Ensure the symbol key holds a list
                if symbol not in self.cob_data_cache or not isinstance(self.cob_data_cache[symbol], (list, deque)):
                    self.cob_data_cache[symbol] = []

                # Convert WebSocket format to the standard cache format
                standard_cob_data = {
                    'symbol': symbol,
                    'timestamp': int(cob_data['timestamp'] * 1000),  # Convert to milliseconds
                    'bids': [[bid['price'], bid['size']] for bid in cob_data.get('bids', [])[:50]],
                    'asks': [[ask['price'], ask['size']] for ask in cob_data.get('asks', [])[:50]],
                    'stats': cob_data.get('stats', {})
                }

                # Add to cache
                self.cob_data_cache[symbol].append(standard_cob_data)
                if len(self.cob_data_cache[symbol]) > 300:  # Keep 5 minutes
                    self.cob_data_cache[symbol].pop(0)

            # Notify subscribers
            self._notify_cob_subscribers(symbol, standard_cob_data)

            logger.debug(f"Processed WebSocket COB tick for {symbol}: {len(cob_data.get('bids', []))} bids, {len(cob_data.get('asks', []))} asks")

        except Exception as e:
            logger.error(f"Error processing WebSocket COB data for {symbol}: {e}", exc_info=True)

    def _on_cob_websocket_status(self, status_data: dict):
        """Handle WebSocket status updates"""
        try:
            symbol = status_data.get('symbol')
            status = status_data.get('status')
            message = status_data.get('message', '')

            logger.info(f"COB WebSocket status for {symbol}: {status} - {message}")

        except Exception as e:
            logger.error(f"Error handling WebSocket status: {e}")

    def _aggregate_ticks_for_symbol(self, symbol: str, current_second: int):
        """Aggregate raw ticks for a symbol into 1-second intervals"""
        try:
            if not hasattr(self, 'cob_raw_ticks') or symbol not in self.cob_raw_ticks:
                return

            # Get ticks for the previous second
            target_second = current_second - 1
            target_ticks = []

            # Filter ticks for the target second
            for tick in self.cob_raw_ticks[symbol]:
                tick_time = tick.get('timestamp', 0)
                if isinstance(tick_time, (int, float)):
                    tick_second = int(tick_time)
                    if tick_second == target_second:
                        target_ticks.append(tick)

            if not target_ticks:
                return

            # Aggregate the ticks
            aggregated_data = self._create_1s_aggregation(symbol, target_ticks, target_second)

            # Store aggregated data
            if not hasattr(self, 'cob_1s_aggregated'):
                self.cob_1s_aggregated = {
                    'ETH/USDT': deque(maxlen=900),
                    'BTC/USDT': deque(maxlen=900)
                }

            self.cob_1s_aggregated[symbol].append(aggregated_data)

            # Note: deque with maxlen automatically handles the size limit, no manual trimming needed

            logger.debug(f"Aggregated {len(target_ticks)} ticks for {symbol} at second {target_second}")

        except Exception as e:
            logger.error(f"Error aggregating ticks for {symbol}: {e}")

    def _create_1s_aggregation(self, symbol: str, ticks: list, timestamp: int) -> dict:
        """Create a 1-second aggregation from raw ticks"""
        try:
            if not ticks:
                return {}

            # Get first and last tick for open/close
            first_tick = ticks[0]
            last_tick = ticks[-1]

            # Extract price data
            prices = []
            volumes = []
            spreads = []
            imbalances = []

            best_bids = []
            best_asks = []

            for tick in ticks:
                stats = tick.get('stats', {})
                if stats:
                    mid_price = stats.get('mid_price', 0)
                    if mid_price > 0:
                        prices.append(mid_price)

                    # Volume data
                    bid_vol = stats.get('bid_volume', 0)
                    ask_vol = stats.get('ask_volume', 0)
                    total_vol = bid_vol + ask_vol
                    if total_vol > 0:
                        volumes.append(total_vol)

                    # Spread data
                    spread_bps = stats.get('spread_bps', 0)
                    if spread_bps > 0:
                        spreads.append(spread_bps)

                    # Imbalance data
                    imbalance = stats.get('imbalance', 0)
                    imbalances.append(imbalance)

                    # Best bid/ask
                    best_bid = stats.get('best_bid', 0)
                    best_ask = stats.get('best_ask', 0)
                    if best_bid > 0:
                        best_bids.append(best_bid)
                    if best_ask > 0:
                        best_asks.append(best_ask)

            # Calculate OHLC for prices
            if prices:
                open_price = prices[0]
                close_price = prices[-1]
                high_price = max(prices)
                low_price = min(prices)
            else:
                open_price = close_price = high_price = low_price = 0

            # Calculate aggregated metrics
            avg_volume = sum(volumes) / len(volumes) if volumes else 0
            avg_spread = sum(spreads) / len(spreads) if spreads else 0
            avg_imbalance = sum(imbalances) / len(imbalances) if imbalances else 0

            # Best bid/ask aggregation
            avg_best_bid = sum(best_bids) / len(best_bids) if best_bids else 0
            avg_best_ask = sum(best_asks) / len(best_asks) if best_asks else 0

            # Order book depth aggregation
            total_bid_levels = 0
            total_ask_levels = 0
            total_bid_liquidity = 0
            total_ask_liquidity = 0

            for tick in ticks:
                stats = tick.get('stats', {})
                total_bid_levels += stats.get('bid_levels', 0)
                total_ask_levels += stats.get('ask_levels', 0)
                total_bid_liquidity += stats.get('bid_volume', 0)
                total_ask_liquidity += stats.get('ask_volume', 0)

            avg_bid_levels = total_bid_levels / len(ticks) if ticks else 0
            avg_ask_levels = total_ask_levels / len(ticks) if ticks else 0
            avg_bid_liquidity = total_bid_liquidity / len(ticks) if ticks else 0
            avg_ask_liquidity = total_ask_liquidity / len(ticks) if ticks else 0

            # Create aggregated data structure
            aggregated = {
                'symbol': symbol,
                'timestamp': timestamp,
                'tick_count': len(ticks),
                'price_ohlc': {
                    'open': open_price,
                    'high': high_price,
                    'low': low_price,
                    'close': close_price
                },
                'volume': {
                    'average': avg_volume,
                    'total_bid': total_bid_liquidity,
                    'total_ask': total_ask_liquidity,
                    'average_bid': avg_bid_liquidity,
                    'average_ask': avg_ask_liquidity
                },
                'spread': {
                    'average_bps': avg_spread,
                    'min_bps': min(spreads) if spreads else 0,
                    'max_bps': max(spreads) if spreads else 0
                },
                'imbalance': {
                    'average': avg_imbalance,
                    'min': min(imbalances) if imbalances else 0,
                    'max': max(imbalances) if imbalances else 0
                },
                'depth': {
                    'average_bid_levels': avg_bid_levels,
                    'average_ask_levels': avg_ask_levels,
                    'total_levels': avg_bid_levels + avg_ask_levels
                },
                'best_prices': {
                    'average_bid': avg_best_bid,
                    'average_ask': avg_best_ask,
                    'average_mid': (avg_best_bid + avg_best_ask) / 2 if (avg_best_bid > 0 and avg_best_ask > 0) else 0
                },
                'raw_tick_data': {
                    'first_tick_time': first_tick.get('timestamp', 0),
                    'last_tick_time': last_tick.get('timestamp', 0),
                    'source': first_tick.get('source', 'unknown')
                }
            }

            return aggregated

        except Exception as e:
            logger.error(f"Error creating 1s aggregation for {symbol}: {e}")
            return {}

    def _notify_cob_subscribers(self, symbol: str, cob_snapshot: dict):
        """Notify subscribers of new COB data"""
        with self.subscriber_lock:
            if not hasattr(self, 'cob_subscribers'):
                self.cob_subscribers = {}

            # Notify all subscribers for this symbol
            for subscriber_id, callback in self.cob_subscribers.items():
                try:
                    callback(symbol, cob_snapshot)
                except Exception as e:
                    logger.debug(f"Error notifying COB subscriber {subscriber_id}: {e}")

    def subscribe_to_cob(self, callback) -> str:
        """Subscribe to COB data updates"""
        with self.subscriber_lock:
            if not hasattr(self, 'cob_subscribers'):
                self.cob_subscribers = {}

            subscriber_id = str(uuid.uuid4())
            self.cob_subscribers[subscriber_id] = callback

            # Start collection if not already started
            self.start_cob_collection()

            return subscriber_id

    def get_latest_cob_data(self, symbol: str) -> dict:
        """Get latest COB data for a symbol"""
        with self.subscriber_lock:
            # Use the original symbol format for the cache lookup (matches how data is stored)
            logger.debug(f"Getting COB data for {symbol}")

            if not hasattr(self, 'cob_data_cache'):
                logger.debug("COB data cache not initialized")
                return {}

            if symbol not in self.cob_data_cache:
                logger.debug(f"Symbol {symbol} not in COB cache. Available: {list(self.cob_data_cache.keys())}")
                return {}

            if not self.cob_data_cache[symbol]:
                logger.debug(f"COB cache for {symbol} is empty")
                return {}

            latest_data = self.cob_data_cache[symbol][-1]
            logger.debug(f"Latest COB data type for {symbol}: {type(latest_data)}")
            return latest_data

    def get_cob_raw_ticks(self, symbol: str, count: int = 100) -> List[dict]:
        """Get raw COB ticks for a symbol (100+ updates per second)"""
        try:
            if not hasattr(self, 'cob_raw_ticks') or symbol not in self.cob_raw_ticks:
                return []

            # Return the most recent 'count' ticks
            return list(self.cob_raw_ticks[symbol])[-count:]

        except Exception as e:
            logger.error(f"Error getting raw COB ticks for {symbol}: {e}")
            return []

    def get_cob_1s_aggregated(self, symbol: str, count: int = 60) -> List[dict]:
        """Get 1-second aggregated COB data for a symbol"""
        try:
            if not hasattr(self, 'cob_1s_aggregated') or symbol not in self.cob_1s_aggregated:
                return []

            # Return the most recent 'count' 1-second aggregations
            return list(self.cob_1s_aggregated[symbol])[-count:]

        except Exception as e:
            logger.error(f"Error getting 1s aggregated COB data for {symbol}: {e}")
            return []

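    # Usage sketch (hypothetical; assumes COB collection has been started via
    # start_cob_collection or subscribe_to_cob):
    #
    #     last_minute = data_provider.get_cob_1s_aggregated('ETH/USDT', count=60)
    #     for bucket in last_minute:
    #         ohlc = bucket['price_ohlc']
    #         print(bucket['timestamp'], ohlc['close'], bucket['imbalance']['average'])
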
    def get_combined_ohlcv_cob_data(self, symbol: str, timeframe: str = '1s', count: int = 60) -> dict:
        """
        Get combined OHLCV and COB data for model inputs

        Returns:
            dict: {
                'ohlcv': DataFrame with OHLCV data,
                'cob_1s': List of 1-second aggregated COB data,
                'cob_raw_ticks': List of raw COB ticks,
                'timestamps_aligned': bool
            }
        """
        try:
            # Get OHLCV data
            ohlcv_data = self.get_historical_data(symbol, timeframe, limit=count, refresh=True)

            # Get COB data
            cob_1s_data = self.get_cob_1s_aggregated(symbol, count)
            cob_raw_ticks = self.get_cob_raw_ticks(symbol, count * 10)  # More raw ticks

            # Check timestamp alignment
            timestamps_aligned = False
            if ohlcv_data is not None and cob_1s_data:
                try:
                    # Get latest timestamps
                    latest_ohlcv_time = ohlcv_data.index[-1].timestamp() if hasattr(ohlcv_data.index[-1], 'timestamp') else 0
                    latest_cob_time = cob_1s_data[-1].get('timestamp', 0)

                    # Check if the timestamps are within 5 seconds of each other
                    time_diff = abs(latest_ohlcv_time - latest_cob_time)
                    timestamps_aligned = time_diff <= 5

                except Exception as e:
                    logger.debug(f"Error checking timestamp alignment: {e}")

            result = {
                'symbol': symbol,
                'timeframe': timeframe,
                'ohlcv': ohlcv_data,
                'cob_1s': cob_1s_data,
                'cob_raw_ticks': cob_raw_ticks,
                'timestamps_aligned': timestamps_aligned,
                'ohlcv_count': len(ohlcv_data) if ohlcv_data is not None else 0,
                'cob_1s_count': len(cob_1s_data),
                'cob_raw_count': len(cob_raw_ticks),
                'data_quality': self._assess_data_quality(ohlcv_data, cob_1s_data, cob_raw_ticks)
            }

            logger.debug(f"Combined data for {symbol}: OHLCV={result['ohlcv_count']}, COB_1s={result['cob_1s_count']}, COB_raw={result['cob_raw_count']}, aligned={timestamps_aligned}")

            return result

        except Exception as e:
            logger.error(f"Error getting combined OHLCV+COB data for {symbol}: {e}")
            return {
                'symbol': symbol,
                'timeframe': timeframe,
                'ohlcv': None,
                'cob_1s': [],
                'cob_raw_ticks': [],
                'timestamps_aligned': False,
                'ohlcv_count': 0,
                'cob_1s_count': 0,
                'cob_raw_count': 0,
                'data_quality': 'error'
            }

    def _assess_data_quality(self, ohlcv_data, cob_1s_data, cob_raw_ticks) -> str:
        """Assess the quality of the combined data"""
        try:
            # Check if we have all data types
            has_ohlcv = ohlcv_data is not None and not ohlcv_data.empty
            has_cob_1s = len(cob_1s_data) > 0
            has_cob_raw = len(cob_raw_ticks) > 0

            if has_ohlcv and has_cob_1s and has_cob_raw:
                # Check data freshness (within the last 60 seconds)
                current_time = time.time()

                # Check OHLCV freshness
                ohlcv_fresh = False
                if has_ohlcv:
                    try:
                        latest_ohlcv_time = ohlcv_data.index[-1].timestamp()
                        ohlcv_fresh = (current_time - latest_ohlcv_time) <= 60
                    except Exception:
                        pass

                # Check COB freshness
                cob_fresh = False
                if has_cob_1s:
                    try:
                        latest_cob_time = cob_1s_data[-1].get('timestamp', 0)
                        cob_fresh = (current_time - latest_cob_time) <= 60
                    except Exception:
                        pass

                if ohlcv_fresh and cob_fresh:
                    return 'excellent'
                elif has_ohlcv and has_cob_1s:
                    return 'good'
                else:
                    return 'fair'
            elif has_ohlcv and has_cob_1s:
                return 'good'
            elif has_ohlcv or has_cob_1s:
                return 'limited'
            else:
                return 'poor'

        except Exception as e:
            logger.error(f"Error assessing data quality: {e}")
            return 'unknown'

    def get_model_input_features(self, symbol: str, feature_count: int = 100) -> dict:
        """
        Get comprehensive model input features combining OHLCV and COB data

        Returns:
            dict: {
                'features': numpy array of shape (feature_count,),
                'feature_names': list of feature names,
                'timestamp': latest timestamp,
                'data_sources': list of data sources used
            }
        """
        try:
            # Get combined data
            combined_data = self.get_combined_ohlcv_cob_data(symbol, '1s', count=60)

            features = []
            feature_names = []
            data_sources = []

            # OHLCV features (40 features)
            if combined_data['ohlcv'] is not None and not combined_data['ohlcv'].empty:
                ohlcv_df = combined_data['ohlcv'].tail(20)  # Last 20 seconds
                data_sources.append('ohlcv')

                # Price features (20 features)
                for i, (_, row) in enumerate(ohlcv_df.iterrows()):
                    if len(features) < 20:
                        features.extend([
                            row.get('close', 0) / 100000,    # Normalized price
                            row.get('volume', 0) / 1000000,  # Normalized volume
                        ])
                        feature_names.extend([f'ohlcv_close_{i}', f'ohlcv_volume_{i}'])

                # Technical indicators (20 features)
                if len(ohlcv_df) > 0:
                    latest_row = ohlcv_df.iloc[-1]
                    tech_features = [
                        latest_row.get('sma_10', 0) / 100000,
                        latest_row.get('sma_20', 0) / 100000,
                        latest_row.get('ema_12', 0) / 100000,
                        latest_row.get('ema_26', 0) / 100000,
                        latest_row.get('rsi', 50) / 100,
                        latest_row.get('macd', 0) / 1000,
                        latest_row.get('bb_upper', 0) / 100000,
                        latest_row.get('bb_lower', 0) / 100000,
                        latest_row.get('atr', 0) / 1000,
                        latest_row.get('adx', 0) / 100,
                    ]
                    # Pad to 20 features
                    tech_features.extend([0.0] * (20 - len(tech_features)))
                    features.extend(tech_features[:20])
                    feature_names.extend([f'tech_{i}' for i in range(20)])
            else:
                # Pad with zeros if no OHLCV data
                features.extend([0.0] * 40)
                feature_names.extend([f'ohlcv_missing_{i}' for i in range(40)])

            # COB 1s aggregated features (40 features)
            if combined_data['cob_1s']:
                data_sources.append('cob_1s')
                cob_1s_data = combined_data['cob_1s'][-20:]  # Last 20 seconds

                for i, cob_data in enumerate(cob_1s_data):
                    if len(features) < 80:  # 40 OHLCV + 40 COB
                        price_ohlc = cob_data.get('price_ohlc', {})
                        volume_data = cob_data.get('volume', {})

                        features.extend([
                            price_ohlc.get('close', 0) / 100000,      # Normalized close price
                            volume_data.get('average', 0) / 1000000,  # Normalized volume
                        ])
                        feature_names.extend([f'cob_1s_close_{i}', f'cob_1s_volume_{i}'])
            else:
                # Pad with zeros if no COB 1s data
                features.extend([0.0] * 40)
                feature_names.extend([f'cob_1s_missing_{i}' for i in range(40)])

            # COB raw tick features (20 features)
            if combined_data['cob_raw_ticks']:
                data_sources.append('cob_raw')
                raw_ticks = combined_data['cob_raw_ticks'][-100:]  # Last 100 ticks

                # Aggregate raw tick statistics
                if raw_ticks:
                    spreads = []
                    imbalances = []
                    volumes = []

                    for tick in raw_ticks:
                        stats = tick.get('stats', {})
                        if stats:
                            spreads.append(stats.get('spread_bps', 0))
                            imbalances.append(stats.get('imbalance', 0))
                            volumes.append(stats.get('bid_volume', 0) + stats.get('ask_volume', 0))

                    # Statistical features from raw ticks
                    raw_features = [
                        np.mean(spreads) / 100 if spreads else 0,      # Average spread
                        np.std(spreads) / 100 if spreads else 0,       # Spread volatility
                        np.mean(imbalances) if imbalances else 0,      # Average imbalance
                        np.std(imbalances) if imbalances else 0,       # Imbalance volatility
                        np.mean(volumes) / 1000000 if volumes else 0,  # Average volume
                        len(raw_ticks) / 100,                          # Tick frequency (normalized)
                    ]
                    # Pad to 20 features
                    raw_features.extend([0.0] * (20 - len(raw_features)))
                    features.extend(raw_features[:20])
                    feature_names.extend([f'cob_raw_{i}' for i in range(20)])
                else:
                    features.extend([0.0] * 20)
                    feature_names.extend([f'cob_raw_empty_{i}' for i in range(20)])
            else:
                # Pad with zeros if no raw tick data
                features.extend([0.0] * 20)
                feature_names.extend([f'cob_raw_missing_{i}' for i in range(20)])

            # Ensure we have exactly the requested number of features
            if len(features) > feature_count:
                features = features[:feature_count]
                feature_names = feature_names[:feature_count]
            elif len(features) < feature_count:
                padding_needed = feature_count - len(features)
                features.extend([0.0] * padding_needed)
                feature_names.extend([f'padding_{i}' for i in range(padding_needed)])

            # Get latest timestamp
            latest_timestamp = 0
            if combined_data['ohlcv'] is not None and not combined_data['ohlcv'].empty:
                try:
                    latest_timestamp = combined_data['ohlcv'].index[-1].timestamp()
                except Exception:
                    pass
            elif combined_data['cob_1s']:
                try:
                    latest_timestamp = combined_data['cob_1s'][-1].get('timestamp', 0)
                except Exception:
                    pass

            result = {
                'features': np.array(features, dtype=np.float32),
                'feature_names': feature_names,
                'timestamp': latest_timestamp,
                'data_sources': data_sources,
                'data_quality': combined_data['data_quality'],
                'feature_count': len(features)
            }

            logger.debug(f"Generated {len(features)} model features for {symbol} from sources: {data_sources}")

            return result

        except Exception as e:
            logger.error(f"Error generating model input features for {symbol}: {e}")
            return {
                'features': np.zeros(feature_count, dtype=np.float32),
                'feature_names': [f'error_{i}' for i in range(feature_count)],
                'timestamp': 0,
                'data_sources': [],
                'data_quality': 'error',
                'feature_count': feature_count
            }

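    # Usage sketch (hypothetical model consumer; `my_model` is illustrative only):
    #
    #     payload = data_provider.get_model_input_features('ETH/USDT', feature_count=100)
    #     x = payload['features']  # np.float32 vector of length 100
    #     if payload['data_quality'] in ('excellent', 'good'):
    #         prediction = my_model.predict(x.reshape(1, -1))
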
    def get_cob_data(self, symbol: str, count: int = 50) -> List[dict]:
        """Get recent COB data for a symbol"""
        with self.subscriber_lock:
            # Use the original symbol format for the cache lookup (matches how data is stored)
            if not hasattr(self, 'cob_data_cache') or symbol not in self.cob_data_cache:
                return []

            # Return the most recent 'count' snapshots
            return list(self.cob_data_cache[symbol])[-count:]

    def get_data_summary(self) -> dict:
        """Get summary of all collected data"""
        summary = {
            'symbols': self.symbols,
            'subscribers': {
                'tick_subscribers': len(self.subscribers),
                'cob_subscribers': len(self.cob_data_callbacks),
                'training_subscribers': len(self.training_data_callbacks),
                'prediction_subscribers': len(self.model_prediction_callbacks)
            },
            'data_counts': {},
            'collection_status': {
                'cob_collection': self.cob_collection_active,
                'training_collection': self.training_data_collection_active,
                'streaming': self.is_streaming
            }
        }

        # Add data counts for each symbol
        for symbol in self.symbols:
            binance_symbol = symbol.replace('/', '').upper()
            summary['data_counts'][symbol] = {
                'ticks': len(self.tick_buffers.get(binance_symbol, [])),
                'cob_snapshots': len(self.cob_data_cache.get(binance_symbol, [])),
                'training_samples': len(self.training_data_cache.get(binance_symbol, []))
            }

        return summary

    def _update_price_buckets(self, symbol: str, cob_data: Dict):
        """Update price-level buckets based on new COB data."""
        try:
            bids = cob_data.get('bids', [])
            asks = cob_data.get('asks', [])

            for size in self.bucket_sizes:
                bid_buckets = self._calculate_buckets(bids, size)
                ask_buckets = self._calculate_buckets(asks, size)

                bucketed_data = {
                    'symbol': symbol,
                    'timestamp': datetime.now(),
                    'bucket_size': size,
                    'bids': bid_buckets,
                    'asks': ask_buckets
                }

                if symbol not in self.bucketed_cob_data:
                    self.bucketed_cob_data[symbol] = {}
                self.bucketed_cob_data[symbol][size] = bucketed_data

                # Distribute to subscribers
                self._distribute_bucketed_data(symbol, size, bucketed_data)

        except Exception as e:
            logger.error(f"Error updating price buckets for {symbol}: {e}")

    def _calculate_buckets(self, levels: List[Dict], bucket_size: int) -> Dict[float, float]:
        """Calculate aggregated volume for price buckets."""
        buckets = {}
        for level in levels:
            price = level.get('price', 0)
            volume = level.get('volume', 0)
            if price > 0 and volume > 0:
                bucket = math.floor(price / bucket_size) * bucket_size
                if bucket not in buckets:
                    buckets[bucket] = 0
                buckets[bucket] += volume
        return buckets

    def subscribe_to_bucketed_cob(self, bucket_size: int, callback: Callable):
        """Subscribe to bucketed COB data."""
        if bucket_size in self.bucketed_cob_callbacks:
            self.bucketed_cob_callbacks[bucket_size].append(callback)
            logger.info(f"New subscriber for ${bucket_size} bucketed COB data.")
        else:
            logger.warning(f"Bucket size {bucket_size} not supported.")

    def _distribute_bucketed_data(self, symbol: str, bucket_size: int, data: Dict):
        """Distribute bucketed data to subscribers."""
        if bucket_size in self.bucketed_cob_callbacks:
            for callback in self.bucketed_cob_callbacks[bucket_size]:
                try:
                    callback(symbol, data)
                except Exception as e:
                    logger.error(f"Error in bucketed COB callback: {e}")