# gogo2/training/enhanced_pivot_rl_trainer.py

"""
Enhanced Pivot-Based RL Trainer
Integrates Williams Market Structure pivot points with CNN predictions
for improved trading decisions and training rewards.
Key Features:
- Train RL model to buy/sell at local pivot points
- CNN predicts next pivot to avoid late signals
- Different thresholds for entry vs exit
- Rewards for staying uninvested when uncertain
- Uncertainty-based confidence adjustment
"""

import asyncio
import logging
import time
from collections import deque, namedtuple
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any, Union, TYPE_CHECKING

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from core.config import get_config
from core.data_provider import DataProvider
from training.williams_market_structure import WilliamsMarketStructure, SwingType, SwingPoint

# Use TYPE_CHECKING to avoid circular import
if TYPE_CHECKING:
    from core.enhanced_orchestrator import EnhancedTradingOrchestrator

logger = logging.getLogger(__name__)


class PivotReward:
    """Reward structure for pivot-based trading decisions"""

    def __init__(self):
        # Pivot-based reward weights
        self.pivot_hit_bonus = 2.0           # Bonus for trading at actual pivot points
        self.pivot_anticipation_bonus = 1.5  # Bonus for trading before pivot (CNN prediction)
        self.wrong_direction_penalty = -1.0  # Penalty for trading opposite to pivot direction
        self.late_entry_penalty = -0.5       # Penalty for entering after pivot is confirmed

        # Stay uninvested rewards
        self.uninvested_reward = 0.1         # Small positive reward for staying out of poor setups
        self.avoid_false_signal_bonus = 0.5  # Bonus for avoiding false signals

        # Uncertainty penalties
        self.overconfidence_penalty = -0.3   # Penalty for being overconfident on losses
        self.underconfidence_penalty = -0.1  # Small penalty for being underconfident on wins
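
# Illustrative arithmetic (a sketch, not output from a real run): with the default
# weights above, a profitable BUY filled within 0.5% of a confirmed swing low and
# within 30 minutes of it collects pivot_hit_bonus (+2.0) plus
# pivot_anticipation_bonus (+1.5) on top of the PnL term (net_pnl / 5.0); the
# combined score is later clipped to [-15.0, +10.0] in calculate_pivot_based_reward().
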

class EnhancedPivotRLTrainer:
    """Enhanced RL trainer focused on Williams pivot points and CNN predictions"""

    def __init__(self,
                 data_provider: Optional[DataProvider] = None,
                 orchestrator: Optional["EnhancedTradingOrchestrator"] = None):
        self.config = get_config()
        self.data_provider = data_provider or DataProvider()
        self.orchestrator = orchestrator

        # Initialize Williams Market Structure with CNN
        self.williams = WilliamsMarketStructure(
            swing_strengths=[2, 4, 6, 8, 10],  # Multiple strengths for better detection
            enable_cnn_feature=True,
            training_data_provider=data_provider
        )

        # Pivot tracking
        self.recent_pivots = deque(maxlen=50)
        self.pivot_predictions = deque(maxlen=20)
        self.trade_outcomes = deque(maxlen=100)

        # Threshold management - different for entry vs exit
        self.entry_threshold = 0.65  # Higher threshold for entering positions
        self.exit_threshold = 0.35   # Lower threshold for exiting positions
        self.max_uninvested_reward_threshold = 0.60  # Stay out if confidence below this

        # Confidence learning parameters
        self.confidence_history = deque(maxlen=200)
        self.mistake_severity_tracker = deque(maxlen=50)

        # Reward calculator
        self.pivot_reward = PivotReward()

        logger.info("Enhanced Pivot RL Trainer initialized")
        logger.info(f"Entry threshold: {self.entry_threshold:.2%}")
        logger.info(f"Exit threshold: {self.exit_threshold:.2%}")
        logger.info(f"Uninvested reward threshold: {self.max_uninvested_reward_threshold:.2%}")

    def calculate_pivot_based_reward(self,
                                     trade_decision: Dict[str, Any],
                                     market_data: pd.DataFrame,
                                     trade_outcome: Dict[str, Any]) -> float:
        """
        Calculate enhanced reward based on pivot points and CNN predictions

        Args:
            trade_decision: The trading decision made by the model
            market_data: Market data context
            trade_outcome: Actual trade outcome

        Returns:
            Enhanced reward score
        """
        try:
            base_pnl = trade_outcome.get('net_pnl', 0.0)
            confidence = trade_decision.get('confidence', 0.5)
            action = trade_decision.get('action', 'HOLD')
            entry_price = trade_decision.get('price', 0.0)
            exit_price = trade_outcome.get('exit_price', entry_price)
            duration = trade_outcome.get('duration', timedelta(0))

            # Base PnL reward
            base_reward = base_pnl / 5.0

            # 1. Pivot Point Analysis Rewards
            pivot_reward = self._calculate_pivot_rewards(
                trade_decision, market_data, trade_outcome
            )

            # 2. CNN Prediction Accuracy Rewards
            cnn_reward = self._calculate_cnn_prediction_rewards(
                trade_decision, market_data, trade_outcome
            )

            # 3. Uninvested Period Rewards
            uninvested_reward = self._calculate_uninvested_rewards(
                trade_decision, confidence
            )

            # 4. Uncertainty-based Confidence Adjustment
            confidence_adjustment = self._calculate_confidence_adjustment(
                trade_decision, trade_outcome
            )

            # 5. Time efficiency with pivot context
            time_reward = self._calculate_time_efficiency_reward(
                duration, base_pnl, market_data
            )

            # Combine all rewards
            total_reward = (
                base_reward +
                pivot_reward +
                cnn_reward +
                uninvested_reward +
                confidence_adjustment +
                time_reward
            )

            # Log detailed reward breakdown
            self._log_reward_breakdown(
                trade_decision, trade_outcome, {
                    'base': base_reward,
                    'pivot': pivot_reward,
                    'cnn': cnn_reward,
                    'uninvested': uninvested_reward,
                    'confidence': confidence_adjustment,
                    'time': time_reward,
                    'total': total_reward
                }
            )

            # Track for learning
            self._track_reward_outcome(trade_decision, trade_outcome, total_reward)

            return np.clip(total_reward, -15.0, 10.0)

        except Exception as e:
            logger.error(f"Error calculating pivot-based reward: {e}")
            return 0.0
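
    # Example payloads (a sketch based only on the keys read above; the numeric
    # values are hypothetical and extra keys passed by callers are simply ignored):
    #   trade_decision = {'action': 'BUY', 'confidence': 0.72, 'price': 3050.0,
    #                     'timestamp': datetime.now()}
    #   trade_outcome  = {'net_pnl': 4.2, 'exit_price': 3062.5,
    #                     'duration': timedelta(minutes=25)}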

    def _calculate_pivot_rewards(self,
                                 trade_decision: Dict[str, Any],
                                 market_data: pd.DataFrame,
                                 trade_outcome: Dict[str, Any]) -> float:
        """Calculate rewards based on proximity to pivot points"""
        try:
            entry_price = trade_decision.get('price', 0.0)
            action = trade_decision.get('action', 'HOLD')
            entry_time = trade_decision.get('timestamp', datetime.now())
            net_pnl = trade_outcome.get('net_pnl', 0.0)

            # Find recent pivot points from Williams analysis
            ohlcv_array = self._convert_dataframe_to_ohlcv_array(market_data)
            if ohlcv_array is None or len(ohlcv_array) < 20:
                return 0.0

            # Get pivot points from Williams structure
            structure_levels = self.williams.calculate_recursive_pivot_points(ohlcv_array)
            if not structure_levels or 'level_0' not in structure_levels:
                return 0.0

            level_0_pivots = structure_levels['level_0'].swing_points
            if not level_0_pivots:
                return 0.0

            # Find closest pivot to entry
            closest_pivot = self._find_closest_pivot(entry_price, entry_time, level_0_pivots)
            if not closest_pivot:
                return 0.0

            # Calculate distance to pivot (price and time)
            price_distance = abs(entry_price - closest_pivot.price) / closest_pivot.price
            time_distance = abs((entry_time - closest_pivot.timestamp).total_seconds()) / 3600.0  # hours

            pivot_reward = 0.0

            # Reward trading at or near pivot points
            if price_distance < 0.005:  # Within 0.5% of pivot
                if time_distance < 0.5:  # Within 30 minutes
                    pivot_reward += self.pivot_reward.pivot_hit_bonus
                    logger.debug(f"PIVOT HIT BONUS: {self.pivot_reward.pivot_hit_bonus:.2f}")

                # Check if trade direction aligns with pivot
                if self._trade_aligns_with_pivot(action, closest_pivot, net_pnl):
                    pivot_reward += self.pivot_reward.pivot_anticipation_bonus
                    logger.debug(f"PIVOT DIRECTION BONUS: {self.pivot_reward.pivot_anticipation_bonus:.2f}")
                else:
                    pivot_reward += self.pivot_reward.wrong_direction_penalty
                    logger.debug(f"WRONG DIRECTION PENALTY: {self.pivot_reward.wrong_direction_penalty:.2f}")

            # Penalty for late entry after pivot confirmation
            if time_distance > 2.0:  # More than 2 hours after pivot
                pivot_reward += self.pivot_reward.late_entry_penalty
                logger.debug(f"LATE ENTRY PENALTY: {self.pivot_reward.late_entry_penalty:.2f}")

            return pivot_reward

        except Exception as e:
            logger.error(f"Error calculating pivot rewards: {e}")
            return 0.0

    def _calculate_cnn_prediction_rewards(self,
                                          trade_decision: Dict[str, Any],
                                          market_data: pd.DataFrame,
                                          trade_outcome: Dict[str, Any]) -> float:
        """Calculate rewards based on CNN pivot predictions"""
        try:
            # Check if we have CNN predictions available
            if not hasattr(self.williams, 'cnn_model') or not self.williams.cnn_model:
                return 0.0

            action = trade_decision.get('action', 'HOLD')
            confidence = trade_decision.get('confidence', 0.5)
            net_pnl = trade_outcome.get('net_pnl', 0.0)

            # Get latest CNN prediction if available
            # This would be the prediction made before the trade
            cnn_prediction = self._get_latest_cnn_prediction()
            if cnn_prediction is None or len(cnn_prediction) == 0:
                return 0.0

            cnn_reward = 0.0

            # Reward for following CNN predictions that turn out correct
            predicted_direction = self._interpret_cnn_prediction(cnn_prediction)

            if predicted_direction == action and net_pnl > 0:
                # CNN prediction was correct and we followed it
                cnn_reward += 1.0 * confidence  # Scale by confidence
                logger.debug(f"CNN CORRECT FOLLOW: +{1.0 * confidence:.2f}")
            elif predicted_direction != action and net_pnl < 0:
                # We didn't follow CNN and it was right (we were wrong)
                cnn_reward -= 0.5
                logger.debug("CNN IGNORE PENALTY: -0.5")
            elif predicted_direction == action and net_pnl < 0:
                # We followed CNN but it was wrong
                cnn_reward -= 0.2  # Small penalty, CNN predictions can be wrong
                logger.debug("CNN WRONG FOLLOW: -0.2")

            return cnn_reward

        except Exception as e:
            logger.error(f"Error calculating CNN prediction rewards: {e}")
            return 0.0

    def _calculate_uninvested_rewards(self,
                                      trade_decision: Dict[str, Any],
                                      confidence: float) -> float:
        """Calculate rewards for staying uninvested when uncertain"""
        try:
            action = trade_decision.get('action', 'HOLD')

            # Reward staying out when confidence is low
            if action == 'HOLD' and confidence < self.max_uninvested_reward_threshold:
                uninvested_reward = self.pivot_reward.uninvested_reward

                # Bonus for avoiding very uncertain setups
                if confidence < 0.4:
                    uninvested_reward += self.pivot_reward.avoid_false_signal_bonus
                    logger.debug(f"AVOID FALSE SIGNAL BONUS: +{self.pivot_reward.avoid_false_signal_bonus:.2f}")

                logger.debug(f"UNINVESTED REWARD: +{uninvested_reward:.2f}")
                return uninvested_reward

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating uninvested rewards: {e}")
            return 0.0

    def _calculate_confidence_adjustment(self,
                                         trade_decision: Dict[str, Any],
                                         trade_outcome: Dict[str, Any]) -> float:
        """Adjust rewards based on confidence vs outcome to reduce overconfidence"""
        try:
            confidence = trade_decision.get('confidence', 0.5)
            net_pnl = trade_outcome.get('net_pnl', 0.0)

            confidence_adjustment = 0.0

            # Track mistake severity
            mistake_severity = abs(net_pnl) if net_pnl < 0 else 0.0
            self.mistake_severity_tracker.append(mistake_severity)

            # Penalize overconfidence on losses
            if net_pnl < 0 and confidence > 0.7:
                # High confidence but loss - penalize overconfidence
                overconfidence_factor = (confidence - 0.7) / 0.3  # 0-1 scale
                severity_factor = min(mistake_severity / 2.0, 1.0)  # Scale by loss size
                penalty = self.pivot_reward.overconfidence_penalty * overconfidence_factor * severity_factor
                confidence_adjustment += penalty
                logger.debug(f"OVERCONFIDENCE PENALTY: {penalty:.2f} (conf: {confidence:.2f}, loss: ${net_pnl:.2f})")

            # Small penalty for underconfidence on wins
            elif net_pnl > 0 and confidence < 0.4:
                underconfidence_factor = (0.4 - confidence) / 0.4  # 0-1 scale
                penalty = self.pivot_reward.underconfidence_penalty * underconfidence_factor
                confidence_adjustment += penalty
                logger.debug(f"UNDERCONFIDENCE PENALTY: {penalty:.2f} (conf: {confidence:.2f}, profit: ${net_pnl:.2f})")

            # Update confidence learning
            self._update_confidence_learning(confidence, net_pnl, mistake_severity)

            return confidence_adjustment

        except Exception as e:
            logger.error(f"Error calculating confidence adjustment: {e}")
            return 0.0

    def _calculate_time_efficiency_reward(self,
                                          duration: timedelta,
                                          net_pnl: float,
                                          market_data: pd.DataFrame) -> float:
        """Calculate time-based rewards considering market context"""
        try:
            duration_hours = duration.total_seconds() / 3600.0

            # Quick profitable trades get bonus
            if net_pnl > 0 and duration_hours < 0.5:  # Less than 30 minutes
                return 0.3
            # Holding losses too long gets penalty
            elif net_pnl < 0 and duration_hours > 2.0:  # More than 2 hours
                return -0.5

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating time efficiency reward: {e}")
            return 0.0

    def update_thresholds_based_on_performance(self):
        """Dynamically adjust entry/exit thresholds based on recent performance"""
        try:
            if len(self.trade_outcomes) < 20:
                return

            recent_outcomes = list(self.trade_outcomes)[-20:]

            # Calculate win rate and average PnL
            wins = sum(1 for outcome in recent_outcomes if outcome['net_pnl'] > 0)
            win_rate = wins / len(recent_outcomes)
            avg_pnl = np.mean([outcome['net_pnl'] for outcome in recent_outcomes])

            # Adjust thresholds based on performance
            if win_rate < 0.4:  # Low win rate - be more selective
                self.entry_threshold = min(self.entry_threshold + 0.02, 0.80)
                logger.info(f"Low win rate ({win_rate:.2%}) - increased entry threshold to {self.entry_threshold:.2%}")
            elif win_rate > 0.6 and avg_pnl > 0:  # High win rate - can be more aggressive
                self.entry_threshold = max(self.entry_threshold - 0.01, 0.50)
                logger.info(f"High win rate ({win_rate:.2%}) - decreased entry threshold to {self.entry_threshold:.2%}")

            # Adjust exit threshold based on loss severity
            avg_loss_severity = np.mean(list(self.mistake_severity_tracker)) if self.mistake_severity_tracker else 0
            if avg_loss_severity > 1.0:  # Large average losses
                self.exit_threshold = max(self.exit_threshold - 0.01, 0.20)
                logger.info(f"High loss severity - decreased exit threshold to {self.exit_threshold:.2%}")

        except Exception as e:
            logger.error(f"Error updating thresholds: {e}")

    def get_current_thresholds(self) -> Dict[str, float]:
        """Get current entry and exit thresholds"""
        return {
            'entry_threshold': self.entry_threshold,
            'exit_threshold': self.exit_threshold,
            'uninvested_threshold': self.max_uninvested_reward_threshold
        }
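
    # Usage sketch (hypothetical caller code, not part of this class): an
    # orchestrator could gate its own signals on these values, e.g. suppress an
    # entry whose confidence is below thresholds['entry_threshold'] and only
    # force an exit once confidence drops under thresholds['exit_threshold'].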

    # Helper methods

    def _convert_dataframe_to_ohlcv_array(self, df: pd.DataFrame) -> Optional[np.ndarray]:
        """Convert pandas DataFrame to numpy array for Williams analysis"""
        try:
            if df.empty:
                return None

            # Ensure we have required columns
            required_cols = ['open', 'high', 'low', 'close', 'volume']
            if not all(col in df.columns for col in required_cols):
                return None

            # Convert to numpy array
            timestamps = df.index.astype(np.int64) // 10**9  # Convert to Unix timestamp
            ohlcv_array = np.column_stack([
                timestamps,
                df['open'].values,
                df['high'].values,
                df['low'].values,
                df['close'].values,
                df['volume'].values
            ])

            return ohlcv_array.astype(np.float64)

        except Exception as e:
            logger.error(f"Error converting DataFrame to OHLCV array: {e}")
            return None
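
    # Resulting layout (implied by the column_stack above): one row per candle,
    # columns [unix_timestamp_seconds, open, high, low, close, volume]; the
    # DataFrame index is assumed to be a DatetimeIndex so the int64 cast yields
    # nanoseconds before the integer division by 10**9.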

    def _find_closest_pivot(self,
                            entry_price: float,
                            entry_time: datetime,
                            pivots: List[SwingPoint]) -> Optional[SwingPoint]:
        """Find the closest pivot point to the trade entry"""
        try:
            if not pivots:
                return None

            # Find pivot closest in time and price
            best_pivot = None
            best_score = float('inf')

            for pivot in pivots:
                time_diff = abs((entry_time - pivot.timestamp).total_seconds()) / 3600.0
                price_diff = abs(entry_price - pivot.price) / pivot.price

                # Combined score (weighted by time and price proximity)
                score = time_diff * 0.3 + price_diff * 100  # Weight price difference more heavily

                if score < best_score:
                    best_score = score
                    best_pivot = pivot

            return best_pivot

        except Exception as e:
            logger.error(f"Error finding closest pivot: {e}")
            return None

    def _trade_aligns_with_pivot(self,
                                 action: str,
                                 pivot: SwingPoint,
                                 net_pnl: float) -> bool:
        """Check if trade direction aligns with pivot type and was profitable"""
        try:
            if net_pnl <= 0:  # Only consider profitable trades as aligned
                return False

            if action == 'BUY' and pivot.swing_type == SwingType.SWING_LOW:
                return True  # Bought at/near swing low
            elif action == 'SELL' and pivot.swing_type == SwingType.SWING_HIGH:
                return True  # Sold at/near swing high

            return False

        except Exception as e:
            logger.error(f"Error checking trade alignment: {e}")
            return False

    def _get_latest_cnn_prediction(self) -> Optional[np.ndarray]:
        """Get the latest CNN prediction from Williams structure"""
        try:
            # This would access the Williams CNN model's latest prediction
            # For now, return None if not available
            if hasattr(self.williams, 'latest_cnn_prediction'):
                return self.williams.latest_cnn_prediction
            return None

        except Exception as e:
            logger.error(f"Error getting CNN prediction: {e}")
            return None

    def _interpret_cnn_prediction(self, prediction: np.ndarray) -> str:
        """Interpret CNN prediction array to trading action"""
        try:
            if len(prediction) < 2:
                return 'HOLD'

            # Assuming prediction format: [type, price] for level 0
            predicted_type = prediction[0]  # 0 = LOW, 1 = HIGH

            if predicted_type > 0.5:
                return 'SELL'  # Expecting swing high - sell
            else:
                return 'BUY'   # Expecting swing low - buy

        except Exception as e:
            logger.error(f"Error interpreting CNN prediction: {e}")
            return 'HOLD'
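
    # Illustrative mapping (hypothetical values): prediction = [0.8, 3125.0] ->
    # swing HIGH expected -> 'SELL'; prediction = [0.2, 3090.0] -> swing LOW
    # expected -> 'BUY'; anything shorter than two elements falls back to 'HOLD'.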

    def _update_confidence_learning(self,
                                    confidence: float,
                                    net_pnl: float,
                                    mistake_severity: float):
        """Update confidence learning parameters"""
        try:
            self.confidence_history.append({
                'confidence': confidence,
                'net_pnl': net_pnl,
                'mistake_severity': mistake_severity,
                'timestamp': datetime.now()
            })

            # Periodically update thresholds based on confidence patterns
            if len(self.confidence_history) % 10 == 0:
                self.update_thresholds_based_on_performance()

        except Exception as e:
            logger.error(f"Error updating confidence learning: {e}")

    def _track_reward_outcome(self,
                              trade_decision: Dict[str, Any],
                              trade_outcome: Dict[str, Any],
                              total_reward: float):
        """Track reward outcomes for analysis"""
        try:
            outcome_record = {
                'timestamp': datetime.now(),
                'action': trade_decision.get('action'),
                'confidence': trade_decision.get('confidence'),
                'net_pnl': trade_outcome.get('net_pnl'),
                'reward': total_reward,
                'duration': trade_outcome.get('duration')
            }
            self.trade_outcomes.append(outcome_record)

        except Exception as e:
            logger.error(f"Error tracking reward outcome: {e}")

    def _log_reward_breakdown(self,
                              trade_decision: Dict[str, Any],
                              trade_outcome: Dict[str, Any],
                              rewards: Dict[str, float]):
        """Log detailed reward breakdown"""
        try:
            action = trade_decision.get('action', 'UNKNOWN')
            confidence = trade_decision.get('confidence', 0.0)
            net_pnl = trade_outcome.get('net_pnl', 0.0)

            logger.info(f"[REWARD] {action} (conf: {confidence:.2%}) PnL: ${net_pnl:.2f} -> Total: {rewards['total']:.2f}")
            logger.debug(f"  Base: {rewards['base']:.2f}, Pivot: {rewards['pivot']:.2f}, CNN: {rewards['cnn']:.2f}")
            logger.debug(f"  Uninvested: {rewards['uninvested']:.2f}, Confidence: {rewards['confidence']:.2f}, Time: {rewards['time']:.2f}")

        except Exception as e:
            logger.error(f"Error logging reward breakdown: {e}")


def create_enhanced_pivot_trainer(data_provider: Optional[DataProvider] = None,
                                  orchestrator: Optional["EnhancedTradingOrchestrator"] = None) -> EnhancedPivotRLTrainer:
    """Factory function to create enhanced pivot trainer"""
    return EnhancedPivotRLTrainer(data_provider, orchestrator)
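

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). It assumes the default DataProvider
# and the Williams CNN wiring can initialize in this environment; the candle
# values and trade figures below are made up, so the printed reward simply
# demonstrates the call shape rather than a meaningful score.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    trainer = create_enhanced_pivot_trainer()

    # Synthetic 1-minute candles - enough rows for the pivot scan (>= 20)
    index = pd.date_range(end=datetime.now(), periods=60, freq="1min")
    closes = 3000.0 + np.cumsum(np.random.randn(60))
    market_data = pd.DataFrame({
        'open': closes,
        'high': closes + 1.0,
        'low': closes - 1.0,
        'close': closes,
        'volume': np.full(60, 100.0),
    }, index=index)

    trade_decision = {
        'action': 'BUY',
        'confidence': 0.7,
        'price': float(closes[-1]),
        'timestamp': index[-1].to_pydatetime(),
    }
    trade_outcome = {
        'net_pnl': 2.5,
        'exit_price': float(closes[-1]) + 2.5,
        'duration': timedelta(minutes=20),
    }

    reward = trainer.calculate_pivot_based_reward(trade_decision, market_data, trade_outcome)
    print(f"Sample reward: {reward:.3f}")
    print(f"Current thresholds: {trainer.get_current_thresholds()}")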