improved data structure

2025-10-31 00:44:08 +02:00
parent b8f54e61fa
commit 7ddf98bf18
16 changed files with 5892 additions and 35 deletions
--- a/core/data_models.py
+++ b/core/data_models.py
@@ -15,7 +15,12 @@ from dataclasses import dataclass, field

@dataclass
 class OHLCVBar:
-    """OHLCV bar data structure"""
+    """
+    Enhanced OHLCV bar data structure with technical analysis features
+    
+    Includes candle pattern recognition, relative sizing, body/wick analysis,
+    and Williams pivot points metadata for improved model feature engineering.
+    """
    symbol: str
    timestamp: datetime
    open: float
@@ -25,6 +30,189 @@ class OHLCVBar:
    volume: float
    timeframe: str
    indicators: Dict[str, float] = field(default_factory=dict)
+    
+    # Pivot points metadata
+    pivot_distance_to_support: Optional[float] = None
+    pivot_distance_to_resistance: Optional[float] = None
+    pivot_level_context: Optional[Dict[str, Any]] = field(default=None)
+    near_pivot_support: bool = False
+    near_pivot_resistance: bool = False
+    
+    # Candle characteristics (computed on-demand or cached)
+    _body_size: Optional[float] = field(default=None, repr=False)
+    _upper_wick: Optional[float] = field(default=None, repr=False)
+    _lower_wick: Optional[float] = field(default=None, repr=False)
+    _total_range: Optional[float] = field(default=None, repr=False)
+    _is_bullish: Optional[bool] = field(default=None, repr=False)
+    
+    @property
+    def body_size(self) -> float:
+        """Absolute open-to-close distance; computed on first access and cached in _body_size"""
+        if self._body_size is None:
+            self._body_size = abs(self.close - self.open)
+        return self._body_size
+    
+    @property
+    def upper_wick(self) -> float:
+        """Distance from the higher of open/close up to the high; cached in _upper_wick"""
+        if self._upper_wick is None:
+            self._upper_wick = self.high - max(self.open, self.close)
+        return self._upper_wick
+    
+    @property
+    def lower_wick(self) -> float:
+        """Distance from the lower of open/close down to the low; cached in _lower_wick"""
+        if self._lower_wick is None:
+            self._lower_wick = min(self.open, self.close) - self.low
+        return self._lower_wick
+    
+    @property
+    def total_range(self) -> float:
+        """High minus low; computed on first access and cached in _total_range"""
+        if self._total_range is None:
+            self._total_range = self.high - self.low
+        return self._total_range
+    
+    @property
+    def is_bullish(self) -> bool:
+        """True if close > open (hollow/green candle); cached in _is_bullish on first access"""
+        if self._is_bullish is None:
+            self._is_bullish = self.close > self.open
+        return self._is_bullish
+    
+    @property
+    def is_bearish(self) -> bool:
+        """True if close < open (solid/red candle); False for bullish and flat (close == open) bars"""
+        return not self.is_bullish and self.close != self.open
+    
+    @property
+    def is_doji(self) -> bool:
+        """True if the body is under 10% of the high-low range, or if the range is not positive"""
+        return self.body_size < (self.total_range * 0.1) if self.total_range > 0 else True
+    
+    def get_body_to_range_ratio(self) -> float:
+        """Body size as a fraction of total range (nominally 0.0 to 1.0); 0.0 when the range is not positive"""
+        return self.body_size / self.total_range if self.total_range > 0 else 0.0
+    
+    def get_upper_wick_ratio(self) -> float:
+        """Upper wick as a fraction of total range (nominally 0.0 to 1.0); 0.0 when the range is not positive"""
+        return self.upper_wick / self.total_range if self.total_range > 0 else 0.0
+    
+    def get_lower_wick_ratio(self) -> float:
+        """Lower wick as a fraction of total range (nominally 0.0 to 1.0); 0.0 when the range is not positive"""
+        return self.lower_wick / self.total_range if self.total_range > 0 else 0.0
+    
+    def get_relative_size(self, reference_bars: List['OHLCVBar'], method: str = 'avg') -> float:
+        """
+        Compare this bar's high-low range with the ranges of reference bars
+        
+        Args:
+            reference_bars: Bars to compare against; bars with a non-positive range are ignored
+            method: 'avg' (mean), 'max' or 'median'; any other value is treated as 'avg'
+        
+        Returns:
+            This bar's range divided by the chosen statistic (1.0 = same size, >1.0 = larger,
+            <1.0 = smaller); 1.0 when there is nothing usable to compare against
+        """
+        # Statistic applied to the reference ranges, keyed by method name
+        aggregators = {'avg': np.mean, 'max': np.max, 'median': np.median}
+        
+        usable_ranges = [
+            candidate.total_range
+            for candidate in (reference_bars or [])
+            if candidate.total_range > 0
+        ]
+        if not usable_ranges:
+            return 1.0
+        
+        # Unrecognised method names fall back to the mean
+        baseline = aggregators.get(method, np.mean)(usable_ranges)
+        if baseline > 0:
+            return self.total_range / baseline
+        return 1.0
+    
+    def get_candle_pattern(self) -> str:
+        """
+        Classify the bar into one basic single-candle pattern
+        
+        Returns:
+            Pattern name: 'doji', 'hammer', 'shooting_star', 'spinning_top', 
+                         'marubozu_bullish', 'marubozu_bearish', 'standard'
+        """
+        # A bar with no range at all is reported as a doji
+        if self.total_range == 0:
+            return 'doji'
+        
+        body_share = self.get_body_to_range_ratio()
+        upper_share = self.get_upper_wick_ratio()
+        lower_share = self.get_lower_wick_ratio()
+        
+        # Doji: body below 10% of the range
+        if body_share < 0.1:
+            return 'doji'
+        
+        # Marubozu: body above 90% of the range, direction taken from is_bullish
+        if body_share > 0.9:
+            if self.is_bullish:
+                return 'marubozu_bullish'
+            return 'marubozu_bearish'
+        
+        # The remaining named patterns all need a body under 30% of the range
+        if body_share < 0.3:
+            if lower_share > 0.6 and upper_share < 0.1:
+                return 'hammer'  # long lower wick, almost no upper wick
+            if upper_share > 0.6 and lower_share < 0.1:
+                return 'shooting_star'  # long upper wick, almost no lower wick
+            if upper_share + lower_share > 0.6:
+                return 'spinning_top'  # wicks together dominate the range
+        
+        return 'standard'
+    
+    def get_ta_features(self, reference_bars: Optional[List['OHLCVBar']] = None) -> Dict[str, float]:
+        """
+        Collect this bar's candle features into a flat dictionary
+        
+        Args:
+            reference_bars: Optional previous bars; when non-empty, relative_size_* keys are added
+        
+        Returns:
+            Dictionary of TA features suitable for model input
+            (flags and pattern one-hot entries are 0.0 or 1.0)
+        """
+        features = {
+            # Direction / shape flags as 0.0 or 1.0
+            'is_bullish': float(bool(self.is_bullish)),
+            'is_bearish': float(bool(self.is_bearish)),
+            'is_doji': float(bool(self.is_doji)),
+            
+            # Shares of the high-low range
+            'body_to_range_ratio': self.get_body_to_range_ratio(),
+            'upper_wick_ratio': self.get_upper_wick_ratio(),
+            'lower_wick_ratio': self.get_lower_wick_ratio(),
+        }
+        
+        # Sizes relative to the close price; left at 0.0 when close is not positive
+        priced = self.close > 0
+        features['body_size_pct'] = self.body_size / self.close if priced else 0.0
+        features['upper_wick_pct'] = self.upper_wick / self.close if priced else 0.0
+        features['lower_wick_pct'] = self.lower_wick / self.close if priced else 0.0
+        features['total_range_pct'] = self.total_range / self.close if priced else 0.0
+        
+        # Volume per unit of price range
+        features['volume_per_range'] = self.volume / self.total_range if self.total_range > 0 else 0.0
+        
+        # Relative sizing only when reference bars are supplied
+        if reference_bars:
+            for stat in ('avg', 'max', 'median'):
+                features[f'relative_size_{stat}'] = self.get_relative_size(reference_bars, stat)
+        
+        # One-hot encoding of the detected pattern
+        detected = self.get_candle_pattern()
+        for label in ('doji', 'hammer', 'shooting_star', 'spinning_top',
+                      'marubozu_bullish', 'marubozu_bearish', 'standard'):
+            features[f'pattern_{label}'] = float(label == detected)
+        
+        return features

@dataclass
 class PivotPoint:
@@ -66,6 +254,44 @@ class COBData:
    ma_15s_imbalance: Dict[float, float] = field(default_factory=dict)  # 15s MA
    ma_60s_imbalance: Dict[float, float] = field(default_factory=dict)  # 60s MA

+@dataclass
+class NormalizationBounds:
+    """Min/max boundaries used to scale price and volume values into the 0-1 range"""
+    price_min: float
+    price_max: float
+    volume_min: float
+    volume_max: float
+    symbol: str
+    timeframe: str = 'all'  # 'all' means across all timeframes
+    
+    def get_price_range(self) -> float:
+        """Width of the price interval (price_max - price_min)"""
+        return self.price_max - self.price_min
+    
+    def get_volume_range(self) -> float:
+        """Width of the volume interval (volume_max - volume_min)"""
+        return self.volume_max - self.volume_min
+    
+    def normalize_price(self, price: float) -> float:
+        """Scale price linearly so price_min -> 0 and price_max -> 1; 0.5 if min == max"""
+        if self.price_max != self.price_min:
+            return (price - self.price_min) / self.get_price_range()
+        return 0.5
+    
+    def denormalize_price(self, normalized: float) -> float:
+        """Map a 0-1 normalized price back onto the original price scale"""
+        return self.price_min + normalized * self.get_price_range()
+    
+    def normalize_volume(self, volume: float) -> float:
+        """Scale volume linearly so volume_min -> 0 and volume_max -> 1; 0.5 if min == max"""
+        if self.volume_max != self.volume_min:
+            return (volume - self.volume_min) / self.get_volume_range()
+        return 0.5
+    
+    def denormalize_volume(self, normalized: float) -> float:
+        """Map a 0-1 normalized volume back onto the original volume scale"""
+        return self.volume_min + normalized * self.get_volume_range()
+
@dataclass
 class BaseDataInput:
    """
@@ -75,6 +301,7 @@ class BaseDataInput:
    - OHLCV: 300 frames of (1s, 1m, 1h, 1d) ETH + 300s of 1s BTC
    - COB: ±20 buckets of COB amounts in USD for each 1s OHLCV
    - MA: 1s, 5s, 15s, and 60s MA of COB imbalance counting ±5 COB buckets
+    - All OHLCV data is normalized to 0-1 range based on daily (longest timeframe) min/max
    """
    symbol: str  # Primary symbol (ETH/USDT)
    timestamp: datetime
@@ -111,42 +338,224 @@ class BaseDataInput:
    # Position and trading state information
    position_info: Dict[str, Any] = field(default_factory=dict)
    
-    def get_feature_vector(self) -> np.ndarray:
+    # Normalization boundaries (computed on-demand, cached)
+    _normalization_bounds: Optional[NormalizationBounds] = field(default=None, repr=False)
+    _btc_normalization_bounds: Optional[NormalizationBounds] = field(default=None, repr=False)
+    
+    def _compute_normalization_bounds(self) -> NormalizationBounds:
+        """
+        Return the price/volume bounds for the primary symbol, computing them once
+        
+        Every bar in ohlcv_1d, ohlcv_1h, ohlcv_1m and ohlcv_1s contributes, so the
+        bounds span all four timeframes rather than the daily data alone.
+        
+        The result is cached in _normalization_bounds; later calls return that
+        object as-is, even if bars were added in the meantime.
+        
+        Returns:
+            NormalizationBounds: Min/max for price and volume (all 0.0 when there are no bars)
+        """
+        cached = self._normalization_bounds
+        if cached is not None:
+            return cached
+        
+        # Flat lists of every price and volume seen across the timeframes
+        prices = []
+        volumes = []
+        
+        # Daily bars are visited first, followed by 1h, 1m and 1s
+        for series in (self.ohlcv_1d, self.ohlcv_1h, self.ohlcv_1m, self.ohlcv_1s):
+            for candle in series:
+                prices += [candle.open, candle.high, candle.low, candle.close]
+                volumes.append(candle.volume)
+        
+        if prices and volumes:
+            lowest_price = min(prices)
+            highest_price = max(prices)
+            lowest_volume = min(volumes)
+            highest_volume = max(volumes)
+        else:
+            # No bars at all: fall back to an all-zero (degenerate) interval
+            lowest_price = highest_price = 0.0
+            lowest_volume = highest_volume = 0.0
+        
+        bounds = NormalizationBounds(
+            price_min=lowest_price,
+            price_max=highest_price,
+            volume_min=lowest_volume,
+            volume_max=highest_volume,
+            symbol=self.symbol,
+            timeframe='all'
+        )
+        
+        # Cache so later calls skip the scan
+        self._normalization_bounds = bounds
+        return bounds
+    
+    def _compute_btc_normalization_bounds(self) -> NormalizationBounds:
+        """
+        Return the BTC price/volume bounds, computing them once from btc_ohlcv_1s
+        
+        The result is cached in _btc_normalization_bounds and reused on later calls.
+        
+        Returns:
+            NormalizationBounds: Min/max for BTC price and volume (all 0.0 when there are no bars)
+        """
+        cached = self._btc_normalization_bounds
+        if cached is not None:
+            return cached
+        
+        prices = []
+        volumes = []
+        for candle in self.btc_ohlcv_1s:
+            prices += [candle.open, candle.high, candle.low, candle.close]
+            volumes.append(candle.volume)
+        
+        if not (prices and volumes):
+            lowest_price = highest_price = 0.0
+            lowest_volume = highest_volume = 0.0
+        else:
+            lowest_price, highest_price = min(prices), max(prices)
+            lowest_volume, highest_volume = min(volumes), max(volumes)
+        
+        bounds = NormalizationBounds(
+            price_min=lowest_price,
+            price_max=highest_price,
+            volume_min=lowest_volume,
+            volume_max=highest_volume,
+            symbol='BTC/USDT',
+            timeframe='1s'
+        )
+        self._btc_normalization_bounds = bounds
+        return bounds
+    
+    def get_normalization_bounds(self) -> NormalizationBounds:
+        """Return the primary symbol's bounds via _compute_normalization_bounds (cached after the first call)"""
+        return self._compute_normalization_bounds()
+    
+    def get_btc_normalization_bounds(self) -> NormalizationBounds:
+        """Return the BTC bounds via _compute_btc_normalization_bounds (cached after the first call)"""
+        return self._compute_btc_normalization_bounds()
+    
+    def get_feature_vector(self, include_candle_ta: bool = True, normalize: bool = True) -> np.ndarray:
        """
        Convert BaseDataInput to standardized feature vector for models
        
+        Args:
+            include_candle_ta: If True, include enhanced candle TA features (default: True)
+            normalize: If True, normalize OHLCV data to 0-1 range (default: True)
+        
        Returns:
-            np.ndarray: FIXED SIZE standardized feature vector (7850 features)
+            np.ndarray: FIXED SIZE standardized feature vector (7870 or 22880 features)
+            
+        Note:
+            - Full TA features are enabled by default for better model performance
+            - Normalization uses daily (longest timeframe) min/max for primary symbol
+            - BTC data is normalized independently using its own min/max
+            - Normalization bounds are cached and accessible via get_normalization_bounds()
+            - Includes pivot points metadata (10 features) for market structure context
        """
        # FIXED FEATURE SIZE - this should NEVER change at runtime
-        FIXED_FEATURE_SIZE = 7850
+        # Standard: 7870 features (7850 + 10 pivot + 10 more indicators)
+        # With candle TA: 22880 features (22850 + 10 pivot + 10 more indicators)
+        FIXED_FEATURE_SIZE = 22880 if include_candle_ta else 7870
        features = []
        
-        # OHLCV features for ETH (up to 300 frames x 4 timeframes x 5 features)
+        # Get normalization bounds (cached)
+        if normalize:
+            norm_bounds = self._compute_normalization_bounds()
+        
+        # OHLCV features for ETH (up to 300 frames x 4 timeframes x 5 or 15 features)
        for ohlcv_list in [self.ohlcv_1s, self.ohlcv_1m, self.ohlcv_1h, self.ohlcv_1d]:
            # Use actual data only, up to 300 frames
            ohlcv_frames = ohlcv_list[-300:] if len(ohlcv_list) >= 300 else ohlcv_list
            
            # Extract features from actual frames
-            for bar in ohlcv_frames:
-                features.extend([bar.open, bar.high, bar.low, bar.close, bar.volume])
+            for i, bar in enumerate(ohlcv_frames):
+                # Basic OHLCV (5 features) - normalized to 0-1 range
+                if normalize:
+                    features.extend([
+                        norm_bounds.normalize_price(bar.open),
+                        norm_bounds.normalize_price(bar.high),
+                        norm_bounds.normalize_price(bar.low),
+                        norm_bounds.normalize_price(bar.close),
+                        norm_bounds.normalize_volume(bar.volume)
+                    ])
+                else:
+                    features.extend([bar.open, bar.high, bar.low, bar.close, bar.volume])
+                
+                # Enhanced candle TA features (10 additional features per bar)
+                if include_candle_ta:
+                    # Get reference bars for relative sizing (last 10 bars)
+                    ref_start = max(0, i - 10)
+                    reference_bars = ohlcv_frames[ref_start:i] if i > 0 else []
+                    
+                    ta_features = bar.get_ta_features(reference_bars)
+                    # Extract key features in fixed order
+                    features.extend([
+                        ta_features.get('is_bullish', 0.0),
+                        ta_features.get('body_to_range_ratio', 0.0),
+                        ta_features.get('upper_wick_ratio', 0.0),
+                        ta_features.get('lower_wick_ratio', 0.0),
+                        ta_features.get('body_size_pct', 0.0),
+                        ta_features.get('total_range_pct', 0.0),
+                        ta_features.get('relative_size_avg', 1.0),
+                        ta_features.get('pattern_doji', 0.0),
+                        ta_features.get('pattern_hammer', 0.0),
+                        ta_features.get('pattern_shooting_star', 0.0),
+                    ])
            
            # Pad with zeros only if we have some data but less than 300 frames
            frames_needed = 300 - len(ohlcv_frames)
            if frames_needed > 0:
-                features.extend([0.0] * (frames_needed * 5))  # 5 features per frame
+                features_per_frame = 15 if include_candle_ta else 5
+                features.extend([0.0] * (frames_needed * features_per_frame))
        
-        # BTC OHLCV features (up to 300 frames x 5 features = 1500 features)
+        # BTC OHLCV features (up to 300 frames x 5 or 15 features)
        btc_frames = self.btc_ohlcv_1s[-300:] if len(self.btc_ohlcv_1s) >= 300 else self.btc_ohlcv_1s
        
+        # Get BTC normalization bounds (cached, independent from primary symbol)
+        if normalize:
+            btc_norm_bounds = self._compute_btc_normalization_bounds()
+        
        # Extract features from actual BTC frames
-        for bar in btc_frames:
-            features.extend([bar.open, bar.high, bar.low, bar.close, bar.volume])
+        for i, bar in enumerate(btc_frames):
+            # Basic OHLCV (5 features) - normalized to 0-1 range
+            if normalize:
+                features.extend([
+                    btc_norm_bounds.normalize_price(bar.open),
+                    btc_norm_bounds.normalize_price(bar.high),
+                    btc_norm_bounds.normalize_price(bar.low),
+                    btc_norm_bounds.normalize_price(bar.close),
+                    btc_norm_bounds.normalize_volume(bar.volume)
+                ])
+            else:
+                features.extend([bar.open, bar.high, bar.low, bar.close, bar.volume])
+            
+            # Enhanced candle TA features (10 additional features per bar)
+            if include_candle_ta:
+                ref_start = max(0, i - 10)
+                reference_bars = btc_frames[ref_start:i] if i > 0 else []
+                
+                ta_features = bar.get_ta_features(reference_bars)
+                features.extend([
+                    ta_features.get('is_bullish', 0.0),
+                    ta_features.get('body_to_range_ratio', 0.0),
+                    ta_features.get('upper_wick_ratio', 0.0),
+                    ta_features.get('lower_wick_ratio', 0.0),
+                    ta_features.get('body_size_pct', 0.0),
+                    ta_features.get('total_range_pct', 0.0),
+                    ta_features.get('relative_size_avg', 1.0),
+                    ta_features.get('pattern_doji', 0.0),
+                    ta_features.get('pattern_hammer', 0.0),
+                    ta_features.get('pattern_shooting_star', 0.0),
+                ])
        
        # Pad with zeros only if we have some data but less than 300 frames
        btc_frames_needed = 300 - len(btc_frames)
        if btc_frames_needed > 0:
-            features.extend([0.0] * (btc_frames_needed * 5))  # 5 features per frame
+            features_per_frame = 15 if include_candle_ta else 5
+            features.extend([0.0] * (btc_frames_needed * features_per_frame))
        
        # COB features (FIXED SIZE: 200 features)
        cob_features = []
@@ -209,10 +618,42 @@ class BaseDataInput:
        cob_features.extend([0.0] * (200 - len(cob_features)))
        features.extend(cob_features[:200])  # Ensure exactly 200 COB features
        
-        # Technical indicators (FIXED SIZE: 100 features)
+        # Technical indicators (FIXED SIZE: 110 features - expanded to accommodate more indicators)
        indicator_values = list(self.technical_indicators.values())
-        features.extend(indicator_values[:100])  # Take first 100 indicators
-        features.extend([0.0] * max(0, 100 - len(indicator_values)))  # Pad to exactly 100
+        features.extend(indicator_values[:110])  # Take first 110 indicators
+        features.extend([0.0] * max(0, 110 - len(indicator_values)))  # Pad to exactly 110
+        
+        # Pivot points metadata (FIXED SIZE: 10 features)
+        # Extract pivot context from most recent OHLCV bars
+        pivot_features = []
+        if self.ohlcv_1m and len(self.ohlcv_1m) > 0:
+            latest_bar = self.ohlcv_1m[-1]
+            pivot_features.extend([
+                latest_bar.pivot_distance_to_support if latest_bar.pivot_distance_to_support is not None else 0.0,
+                latest_bar.pivot_distance_to_resistance if latest_bar.pivot_distance_to_resistance is not None else 0.0,
+                1.0 if latest_bar.near_pivot_support else 0.0,
+                1.0 if latest_bar.near_pivot_resistance else 0.0,
+            ])
+            # Add pivot level context if available
+            if latest_bar.pivot_level_context:
+                ctx = latest_bar.pivot_level_context
+                pivot_features.extend([
+                    ctx.get('trend_strength', 0.0),
+                    ctx.get('support_count', 0.0),
+                    ctx.get('resistance_count', 0.0),
+                    ctx.get('price_position_in_range', 0.5),  # 0=at support, 1=at resistance
+                    ctx.get('distance_to_nearest_level', 0.0),
+                    ctx.get('level_strength', 0.0),
+                ])
+            else:
+                pivot_features.extend([0.0] * 6)
+        else:
+            pivot_features = [0.0] * 10
+        
+        # Ensure exactly 10 pivot features
+        pivot_features = pivot_features[:10]
+        pivot_features.extend([0.0] * (10 - len(pivot_features)))
+        features.extend(pivot_features)
        
        # Last predictions from other models (FIXED SIZE: 45 features)
        prediction_features = []