""" Historical Data Loader - Integrates with existing DataProvider Provides data loading and caching for the annotation UI, ensuring the same data quality and structure used by training and inference systems. """ import logging from typing import Dict, List, Optional, Tuple from datetime import datetime, timedelta import pandas as pd from pathlib import Path import pickle logger = logging.getLogger(__name__) class HistoricalDataLoader: """ Loads historical data from the main system's DataProvider Ensures consistency with training/inference data """ def __init__(self, data_provider): """ Initialize with existing DataProvider Args: data_provider: Instance of core.data_provider.DataProvider """ self.data_provider = data_provider self.cache_dir = Path("ANNOTATE/data/cache") self.cache_dir.mkdir(parents=True, exist_ok=True) # Cache for recently loaded data self.memory_cache = {} self.cache_ttl = timedelta(minutes=5) # Startup mode - allow stale cache for faster loading self.startup_mode = True logger.info("HistoricalDataLoader initialized with existing DataProvider (startup mode: ON)") def get_data(self, symbol: str, timeframe: str, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, limit: int = 500) -> Optional[pd.DataFrame]: """ Get historical data for symbol and timeframe Args: symbol: Trading pair (e.g., 'ETH/USDT') timeframe: Timeframe (e.g., '1s', '1m', '1h', '1d') start_time: Start time for data range end_time: End time for data range limit: Maximum number of candles to return Returns: DataFrame with OHLCV data or None if unavailable """ # Check memory cache first cache_key = f"{symbol}_{timeframe}_{start_time}_{end_time}_{limit}" if cache_key in self.memory_cache: cached_data, cached_time = self.memory_cache[cache_key] if datetime.now() - cached_time < self.cache_ttl: logger.debug(f"Returning cached data for {symbol} {timeframe}") return cached_data try: # Try unified storage first if available if hasattr(self.data_provider, 'is_unified_storage_enabled') and \ self.data_provider.is_unified_storage_enabled(): try: import asyncio # Get data from unified storage loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) # If we have a specific time range, get historical data if start_time or end_time: target_time = end_time if end_time else start_time inference_data = loop.run_until_complete( self.data_provider.get_inference_data_unified( symbol, timestamp=target_time, context_window_minutes=60 ) ) else: # Get latest real-time data inference_data = loop.run_until_complete( self.data_provider.get_inference_data_unified(symbol) ) # Extract the requested timeframe df = inference_data.get_timeframe_data(timeframe) if df is not None and not df.empty: # Limit number of candles if len(df) > limit: df = df.tail(limit) # Cache in memory self.memory_cache[cache_key] = (df.copy(), datetime.now()) logger.info(f"Loaded {len(df)} candles from unified storage for {symbol} {timeframe}") return df except Exception as e: logger.debug(f"Unified storage not available, falling back to cached data: {e}") # Fallback to existing cached data method # Use DataProvider's cached data if available if hasattr(self.data_provider, 'cached_data'): if symbol in self.data_provider.cached_data: if timeframe in self.data_provider.cached_data[symbol]: df = self.data_provider.cached_data[symbol][timeframe] if df is not None and not df.empty: # Filter by time range if specified if start_time or end_time: df = self._filter_by_time_range(df, start_time, end_time) # Limit number of candles if len(df) > 
                                df = df.tail(limit)

                            # Cache in memory
                            self.memory_cache[cache_key] = (df.copy(), datetime.now())

                            logger.info(f"Loaded {len(df)} candles for {symbol} {timeframe}")
                            return df

            # Final fallback: fetch via DataProvider's historical data method.
            # During startup, allow stale cache to avoid slow API calls.
            if self.startup_mode:
                logger.info(f"Loading data for {symbol} {timeframe} (startup mode: allow stale cache)")
                df = self.data_provider.get_historical_data(
                    symbol=symbol,
                    timeframe=timeframe,
                    limit=limit,
                    allow_stale_cache=True
                )
            else:
                logger.info(f"Fetching fresh data for {symbol} {timeframe}")
                df = self.data_provider.get_historical_data(
                    symbol=symbol,
                    timeframe=timeframe,
                    limit=limit
                )

            if df is not None and not df.empty:
                # Filter by time range if specified
                if start_time or end_time:
                    df = self._filter_by_time_range(df, start_time, end_time)

                # Cache in memory
                self.memory_cache[cache_key] = (df.copy(), datetime.now())

                logger.info(f"Fetched {len(df)} candles for {symbol} {timeframe}")
                return df

            logger.warning(f"No data available for {symbol} {timeframe}")
            return None

        except Exception as e:
            logger.error(f"Error loading data for {symbol} {timeframe}: {e}")
            return None

    def _filter_by_time_range(self, df: pd.DataFrame,
                              start_time: Optional[datetime],
                              end_time: Optional[datetime]) -> pd.DataFrame:
        """Filter DataFrame by time range."""
        if start_time:
            df = df[df.index >= start_time]
        if end_time:
            df = df[df.index <= end_time]
        return df

    def get_multi_timeframe_data(self, symbol: str,
                                 timeframes: List[str],
                                 start_time: Optional[datetime] = None,
                                 end_time: Optional[datetime] = None,
                                 limit: int = 500) -> Dict[str, pd.DataFrame]:
        """
        Get data for multiple timeframes at once.

        Args:
            symbol: Trading pair
            timeframes: List of timeframes
            start_time: Start time for data range
            end_time: End time for data range
            limit: Maximum number of candles per timeframe

        Returns:
            Dictionary mapping timeframe to DataFrame
        """
        result = {}

        for timeframe in timeframes:
            df = self.get_data(
                symbol=symbol,
                timeframe=timeframe,
                start_time=start_time,
                end_time=end_time,
                limit=limit
            )

            if df is not None:
                result[timeframe] = df

        logger.info(f"Loaded data for {len(result)}/{len(timeframes)} timeframes")
        return result

    def prefetch_data(self, symbol: str, timeframes: List[str], limit: int = 1000):
        """
        Prefetch data for smooth scrolling.

        Args:
            symbol: Trading pair
            timeframes: List of timeframes to prefetch
            limit: Number of candles to prefetch
        """
        logger.info(f"Prefetching data for {symbol}: {timeframes}")

        for timeframe in timeframes:
            self.get_data(symbol, timeframe, limit=limit)

    def clear_cache(self):
        """Clear memory cache."""
        self.memory_cache.clear()
        logger.info("Memory cache cleared")

    def disable_startup_mode(self):
        """Disable startup mode to fetch fresh data."""
        self.startup_mode = False
        logger.info("Startup mode disabled - will fetch fresh data on next request")

    def get_data_boundaries(self, symbol: str, timeframe: str) -> Tuple[Optional[datetime], Optional[datetime]]:
        """
        Get the earliest and latest available data timestamps.

        Args:
            symbol: Trading pair
            timeframe: Timeframe

        Returns:
            Tuple of (earliest_time, latest_time), or (None, None) if no data
        """
        try:
            df = self.get_data(symbol, timeframe, limit=10000)

            if df is not None and not df.empty:
                return (df.index.min(), df.index.max())

            return (None, None)

        except Exception as e:
            logger.error(f"Error getting data boundaries: {e}")
            return (None, None)
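
# Illustrative sketch only (never called): how the annotation UI might drive
# the loader. Assumes core.data_provider.DataProvider matches the interface
# used above; the symbol and timeframes are hypothetical.
def _example_loader_usage() -> None:
    from core.data_provider import DataProvider  # assumed import path

    loader = HistoricalDataLoader(DataProvider())

    # Warm the cache for the timeframes the UI needs, then read them back.
    loader.prefetch_data('ETH/USDT', ['1m', '1h'], limit=1000)
    frames = loader.get_multi_timeframe_data('ETH/USDT', ['1m', '1h'], limit=500)
    for timeframe, df in frames.items():
        print(f"{timeframe}: {len(df)} candles")

    # Once startup is over, switch to fresh data on subsequent requests.
    loader.disable_startup_mode()
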
class TimeRangeManager:
    """Manages time range calculations and data prefetching."""

    def __init__(self, data_loader: HistoricalDataLoader):
        """
        Initialize with data loader.

        Args:
            data_loader: HistoricalDataLoader instance
        """
        self.data_loader = data_loader

        # Time range presets in seconds
        self.range_presets = {
            '1h': 3600,
            '4h': 14400,
            '1d': 86400,
            '1w': 604800,
            '1M': 2592000  # 30 days
        }

        logger.info("TimeRangeManager initialized")

    def calculate_time_range(self, center_time: datetime,
                             range_preset: str) -> Tuple[datetime, datetime]:
        """
        Calculate start and end times for a range preset.

        Args:
            center_time: Center point of the range
            range_preset: Range preset ('1h', '4h', '1d', '1w', '1M')

        Returns:
            Tuple of (start_time, end_time)
        """
        range_seconds = self.range_presets.get(range_preset, 86400)
        half_range = timedelta(seconds=range_seconds / 2)

        start_time = center_time - half_range
        end_time = center_time + half_range

        return (start_time, end_time)

    def get_navigation_increment(self, range_preset: str) -> timedelta:
        """
        Get time increment for navigation (10% of range).

        Args:
            range_preset: Range preset

        Returns:
            timedelta for navigation increment
        """
        range_seconds = self.range_presets.get(range_preset, 86400)
        increment_seconds = range_seconds / 10

        return timedelta(seconds=increment_seconds)

    def prefetch_adjacent_ranges(self, symbol: str, timeframes: List[str],
                                 center_time: datetime, range_preset: str):
        """
        Prefetch data for adjacent time ranges for smooth scrolling.

        Args:
            symbol: Trading pair
            timeframes: List of timeframes
            center_time: Current center time
            range_preset: Current range preset
        """
        increment = self.get_navigation_increment(range_preset)

        # Previous range (one navigation step back)
        prev_center = center_time - increment
        prev_start, prev_end = self.calculate_time_range(prev_center, range_preset)

        # Next range (one navigation step forward)
        next_center = center_time + increment
        next_start, next_end = self.calculate_time_range(next_center, range_preset)

        logger.debug(f"Prefetching adjacent ranges for {symbol}")

        # Prefetch in a background daemon thread (non-blocking)
        def prefetch():
            for timeframe in timeframes:
                self.data_loader.get_data(symbol, timeframe, prev_start, prev_end)
                self.data_loader.get_data(symbol, timeframe, next_start, next_end)

        thread = threading.Thread(target=prefetch, daemon=True)
        thread.start()
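
# Minimal usage sketch for the range workflow, assuming a configured
# core.data_provider.DataProvider is importable; the preset and symbol
# below are illustrative.
if __name__ == '__main__':
    from core.data_provider import DataProvider  # assumed import path

    loader = HistoricalDataLoader(DataProvider())
    ranges = TimeRangeManager(loader)

    # Center a 4-hour window on "now" and compute the 10% navigation step.
    center = datetime.now()
    start, end = ranges.calculate_time_range(center, '4h')
    print(f"4h window: {start} .. {end}")
    print(f"navigation step: {ranges.get_navigation_increment('4h')}")

    # Warm the caches around the current view for smooth scrolling.
    ranges.prefetch_adjacent_ranges('ETH/USDT', ['1m'], center, '4h')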