beef up T model

Dobromir Popov
2025-07-02 01:26:07 +03:00
parent 0c8ae823ba
commit 8645f6e8dd
3 changed files with 156 additions and 75 deletions

View File

@@ -23,35 +23,40 @@ logger = logging.getLogger(__name__)
 @dataclass
 class TradingTransformerConfig:
-    """Configuration for trading transformer models"""
+    """Configuration for trading transformer models - SCALED TO 46M PARAMETERS"""
 
-    # Model architecture
-    d_model: int = 512    # Model dimension
-    n_heads: int = 8      # Number of attention heads
-    n_layers: int = 6     # Number of transformer layers
-    d_ff: int = 2048      # Feed-forward dimension
+    # Model architecture - SCALED UP
+    d_model: int = 1024   # Model dimension (2x increase)
+    n_heads: int = 16     # Number of attention heads (2x increase)
+    n_layers: int = 12    # Number of transformer layers (2x increase)
+    d_ff: int = 4096      # Feed-forward dimension (2x increase)
     dropout: float = 0.1  # Dropout rate
 
-    # Input dimensions
-    seq_len: int = 100         # Sequence length for time series
-    cob_features: int = 50     # COB feature dimension
-    tech_features: int = 20    # Technical indicator features
-    market_features: int = 15  # Market microstructure features
+    # Input dimensions - ENHANCED
+    seq_len: int = 150         # Sequence length for time series (1.5x increase)
+    cob_features: int = 100    # COB feature dimension (2x increase)
+    tech_features: int = 40    # Technical indicator features (2x increase)
+    market_features: int = 30  # Market microstructure features (2x increase)
 
     # Output configuration
     n_actions: int = 3              # BUY, SELL, HOLD
     confidence_output: bool = True  # Output confidence scores
 
-    # Training configuration
-    learning_rate: float = 1e-4
-    weight_decay: float = 1e-5
-    warmup_steps: int = 4000
-    max_grad_norm: float = 1.0
+    # Training configuration - OPTIMIZED FOR LARGER MODEL
+    learning_rate: float = 5e-5   # Reduced for larger model
+    weight_decay: float = 1e-4    # Increased regularization
+    warmup_steps: int = 8000      # More warmup steps
+    max_grad_norm: float = 0.5    # Tighter gradient clipping
 
-    # Advanced features
+    # Advanced features - ENHANCED
     use_relative_position: bool = True
     use_multi_scale_attention: bool = True
     use_market_regime_detection: bool = True
     use_uncertainty_estimation: bool = True
+
+    # NEW: Additional scaling features
+    use_deep_attention: bool = True          # Deeper attention mechanisms
+    use_residual_connections: bool = True    # Enhanced residual connections
+    use_layer_norm_variants: bool = True     # Advanced normalization
 
 class PositionalEncoding(nn.Module):
     """Sinusoidal positional encoding for transformer"""
@@ -102,10 +107,10 @@ class RelativePositionalEncoding(nn.Module):
         return self.relative_position_embeddings(final_mat)
 
-class MultiScaleAttention(nn.Module):
-    """Multi-scale attention for capturing different time horizons"""
+class DeepMultiScaleAttention(nn.Module):
+    """Enhanced multi-scale attention with deeper mechanisms for 46M parameter model"""
 
-    def __init__(self, d_model: int, n_heads: int, scales: List[int] = [1, 3, 5, 7]):
+    def __init__(self, d_model: int, n_heads: int, scales: List[int] = [1, 3, 5, 7, 11, 15]):
         super().__init__()
         self.d_model = d_model
         self.n_heads = n_heads
@@ -114,18 +119,49 @@ class MultiScaleAttention(nn.Module):
         assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
 
-        # Multi-scale projections
+        # Enhanced multi-scale projections with deeper architecture
         self.scale_projections = nn.ModuleList([
             nn.ModuleDict({
-                'query': nn.Linear(d_model, d_model),
-                'key': nn.Linear(d_model, d_model),
-                'value': nn.Linear(d_model, d_model),
-                'conv': nn.Conv1d(d_model, d_model, kernel_size=scale,
-                                 padding=scale//2, groups=d_model)
+                'query': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'key': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'value': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'conv': nn.Sequential(
+                    nn.Conv1d(d_model, d_model * 2, kernel_size=scale,
+                             padding=scale//2, groups=d_model),
+                    nn.GELU(),
+                    nn.Conv1d(d_model * 2, d_model, kernel_size=1)
+                )
             }) for scale in scales
         ])
 
-        self.output_projection = nn.Linear(d_model * len(scales), d_model)
+        # Enhanced output projection with residual connection
+        self.output_projection = nn.Sequential(
+            nn.Linear(d_model * len(scales), d_model * 2),
+            nn.GELU(),
+            nn.Dropout(0.1),
+            nn.Linear(d_model * 2, d_model)
+        )
+
+        # Additional attention mechanisms
+        self.cross_scale_attention = nn.MultiheadAttention(
+            d_model, n_heads // 2, dropout=0.1, batch_first=True
+        )
+
         self.dropout = nn.Dropout(0.1)
 
     def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
@@ -133,10 +169,10 @@ class MultiScaleAttention(nn.Module):
         scale_outputs = []
 
         for scale_proj in self.scale_projections:
-            # Apply temporal convolution for this scale
+            # Apply enhanced temporal convolution for this scale
            x_conv = scale_proj['conv'](x.transpose(1, 2)).transpose(1, 2)
 
-            # Standard attention computation
+            # Enhanced attention computation with deeper projections
             Q = scale_proj['query'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
             K = scale_proj['key'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
             V = scale_proj['value'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
@@ -160,9 +196,15 @@ class MultiScaleAttention(nn.Module):
             scale_outputs.append(output)
 
-        # Combine multi-scale outputs
+        # Combine multi-scale outputs with enhanced projection
         combined = torch.cat(scale_outputs, dim=-1)
-        return self.output_projection(combined)
+        output = self.output_projection(combined)
+
+        # Apply cross-scale attention for better integration
+        cross_attended, _ = self.cross_scale_attention(output, output, output, attn_mask=mask)
+
+        # Residual connection
+        return output + cross_attended
 
 class MarketRegimeDetector(nn.Module):
     """Market regime detection module for adaptive behavior"""
@@ -249,9 +291,9 @@ class TradingTransformerLayer(nn.Module):
         super().__init__()
         self.config = config
 
-        # Multi-scale attention or standard attention
+        # Enhanced multi-scale attention or standard attention
         if config.use_multi_scale_attention:
-            self.attention = MultiScaleAttention(config.d_model, config.n_heads)
+            self.attention = DeepMultiScaleAttention(config.d_model, config.n_heads)
         else:
             self.attention = nn.MultiheadAttention(
                 config.d_model, config.n_heads, dropout=config.dropout, batch_first=True
@@ -278,7 +320,7 @@ class TradingTransformerLayer(nn.Module):
     def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
         # Self-attention with residual connection
-        if isinstance(self.attention, MultiScaleAttention):
+        if isinstance(self.attention, DeepMultiScaleAttention):
             attn_output = self.attention(x, mask)
         else:
             attn_output, _ = self.attention(x, x, x, attn_mask=mask)
@@ -323,8 +365,11 @@ class AdvancedTradingTransformer(nn.Module):
             TradingTransformerLayer(config) for _ in range(config.n_layers)
         ])
 
-        # Output heads
+        # Enhanced output heads for 46M parameter model
         self.action_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
             nn.Linear(config.d_model, config.d_model // 2),
             nn.GELU(),
             nn.Dropout(config.dropout),
@@ -333,25 +378,48 @@ class AdvancedTradingTransformer(nn.Module):
         if config.confidence_output:
             self.confidence_head = nn.Sequential(
-                nn.Linear(config.d_model, config.d_model // 4),
+                nn.Linear(config.d_model, config.d_model // 2),
+                nn.GELU(),
+                nn.Dropout(config.dropout),
+                nn.Linear(config.d_model // 2, config.d_model // 4),
                 nn.GELU(),
                 nn.Dropout(config.dropout),
                 nn.Linear(config.d_model // 4, 1),
                 nn.Sigmoid()
             )
 
-        # Uncertainty estimation
+        # Enhanced uncertainty estimation
         if config.use_uncertainty_estimation:
             self.uncertainty_estimator = UncertaintyEstimation(config.d_model)
 
-        # Price prediction head (auxiliary task)
+        # Enhanced price prediction head (auxiliary task)
         self.price_head = nn.Sequential(
-            nn.Linear(config.d_model, config.d_model // 4),
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, config.d_model // 4),
             nn.GELU(),
             nn.Dropout(config.dropout),
             nn.Linear(config.d_model // 4, 1)
         )
+
+        # Additional specialized heads for 46M model
+        self.volatility_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, 1),
+            nn.Softplus()
+        )
+
+        self.trend_strength_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, 1),
+            nn.Tanh()
+        )
 
         # Initialize weights
         self._init_weights()
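
The two new heads are small MLPs over the pooled sequence representation. A standalone sketch mirroring them at the scaled defaults (d_model=1024, dropout=0.1), to show the output ranges the final activations imply:

    import torch
    import torch.nn as nn

    d_model, dropout = 1024, 0.1
    volatility_head = nn.Sequential(
        nn.Linear(d_model, d_model // 2), nn.GELU(), nn.Dropout(dropout),
        nn.Linear(d_model // 2, 1), nn.Softplus()   # non-negative volatility estimate
    )
    trend_strength_head = nn.Sequential(
        nn.Linear(d_model, d_model // 2), nn.GELU(), nn.Dropout(dropout),
        nn.Linear(d_model // 2, 1), nn.Tanh()       # trend strength bounded in [-1, 1]
    )
    pooled = torch.randn(4, d_model)                # stand-in for pooled transformer features
    print(volatility_head(pooled).shape, trend_strength_head(pooled).shape)  # torch.Size([4, 1]) each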
@@ -434,10 +502,17 @@ class AdvancedTradingTransformer(nn.Module):
             outputs['uncertainty_mean'] = uncertainty_mean
             outputs['uncertainty_std'] = uncertainty_std
 
-        # Price prediction (auxiliary task)
+        # Enhanced price prediction (auxiliary task)
         price_pred = self.price_head(pooled)
         outputs['price_prediction'] = price_pred
+
+        # Additional specialized predictions for 46M model
+        volatility_pred = self.volatility_head(pooled)
+        outputs['volatility_prediction'] = volatility_pred
+
+        trend_strength_pred = self.trend_strength_head(pooled)
+        outputs['trend_strength_prediction'] = trend_strength_pred
 
         # Market regime information
         if regime_probs_history:
             outputs['regime_probs'] = torch.stack(regime_probs_history, dim=1)
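
This commit only adds the new predictions to the outputs dict; how they feed training is not shown. One option is an auxiliary multi-task loss over the new keys, sketched below with illustrative weights and hypothetical target tensors (only the output key names come from the diff):

    import torch.nn.functional as F

    def auxiliary_loss(outputs, price_target, volatility_target, trend_target):
        loss_price = F.mse_loss(outputs['price_prediction'].squeeze(-1), price_target)
        loss_vol = F.mse_loss(outputs['volatility_prediction'].squeeze(-1), volatility_target)
        loss_trend = F.mse_loss(outputs['trend_strength_prediction'].squeeze(-1), trend_target)
        return loss_price + 0.5 * loss_vol + 0.5 * loss_trend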

View File

@@ -271,15 +271,15 @@
         ],
         "decision": [
             {
-                "checkpoint_id": "decision_20250702_011418",
+                "checkpoint_id": "decision_20250702_012558",
                 "model_name": "decision",
                 "model_type": "decision_fusion",
-                "file_path": "NN\\models\\saved\\decision\\decision_20250702_011418.pt",
-                "created_at": "2025-07-02T01:14:18.986083",
+                "file_path": "NN\\models\\saved\\decision\\decision_20250702_012558.pt",
+                "created_at": "2025-07-02T01:25:58.614455",
                 "file_size_mb": 0.06720924377441406,
-                "performance_score": 9.999990526608928,
+                "performance_score": 9.999991886192655,
                 "accuracy": null,
-                "loss": 9.473391072236024e-06,
+                "loss": 8.113807345618998e-06,
                 "val_accuracy": null,
                 "val_loss": null,
                 "reward": null,
@@ -291,15 +291,15 @@
                 "wandb_artifact_name": null
             },
             {
-                "checkpoint_id": "decision_20250702_011324",
+                "checkpoint_id": "decision_20250702_012504",
                 "model_name": "decision",
                 "model_type": "decision_fusion",
-                "file_path": "NN\\models\\saved\\decision\\decision_20250702_011324.pt",
-                "created_at": "2025-07-02T01:13:24.579781",
+                "file_path": "NN\\models\\saved\\decision\\decision_20250702_012504.pt",
+                "created_at": "2025-07-02T01:25:04.285477",
                 "file_size_mb": 0.06720924377441406,
-                "performance_score": 9.999990382249775,
+                "performance_score": 9.999991852067678,
                 "accuracy": null,
-                "loss": 9.617750224931245e-06,
+                "loss": 8.147932321987486e-06,
                 "val_accuracy": null,
                 "val_loss": null,
                 "reward": null,
@@ -311,15 +311,15 @@
                 "wandb_artifact_name": null
             },
             {
-                "checkpoint_id": "decision_20250702_011348",
+                "checkpoint_id": "decision_20250702_012502",
                 "model_name": "decision",
                 "model_type": "decision_fusion",
-                "file_path": "NN\\models\\saved\\decision\\decision_20250702_011348.pt",
-                "created_at": "2025-07-02T01:13:48.808520",
+                "file_path": "NN\\models\\saved\\decision\\decision_20250702_012502.pt",
+                "created_at": "2025-07-02T01:25:02.958656",
                 "file_size_mb": 0.06720924377441406,
-                "performance_score": 9.999990223319509,
+                "performance_score": 9.999991847589234,
                 "accuracy": null,
-                "loss": 9.776680491212022e-06,
+                "loss": 8.152410765381393e-06,
                 "val_accuracy": null,
                 "val_loss": null,
                 "reward": null,
@@ -331,15 +331,15 @@
                 "wandb_artifact_name": null
             },
             {
-                "checkpoint_id": "decision_20250702_011333",
+                "checkpoint_id": "decision_20250702_012503",
                 "model_name": "decision",
                 "model_type": "decision_fusion",
-                "file_path": "NN\\models\\saved\\decision\\decision_20250702_011333.pt",
-                "created_at": "2025-07-02T01:13:33.679719",
+                "file_path": "NN\\models\\saved\\decision\\decision_20250702_012503.pt",
+                "created_at": "2025-07-02T01:25:03.108239",
                 "file_size_mb": 0.06720924377441406,
-                "performance_score": 9.999989776407977,
+                "performance_score": 9.99999184242316,
                 "accuracy": null,
-                "loss": 1.0223592022232505e-05,
+                "loss": 8.157576839933662e-06,
                 "val_accuracy": null,
                 "val_loss": null,
                 "reward": null,
@@ -351,15 +351,15 @@
                 "wandb_artifact_name": null
             },
             {
-                "checkpoint_id": "decision_20250702_011411",
+                "checkpoint_id": "decision_20250702_012502",
                 "model_name": "decision",
                 "model_type": "decision_fusion",
-                "file_path": "NN\\models\\saved\\decision\\decision_20250702_011411.pt",
-                "created_at": "2025-07-02T01:14:11.738925",
+                "file_path": "NN\\models\\saved\\decision\\decision_20250702_012502.pt",
+                "created_at": "2025-07-02T01:25:02.603966",
                 "file_size_mb": 0.06720924377441406,
-                "performance_score": 9.99998973893185,
+                "performance_score": 9.999991812171043,
                 "accuracy": null,
-                "loss": 1.0261068149069225e-05,
+                "loss": 8.187828957696905e-06,
                 "val_accuracy": null,
                 "val_loss": null,
                 "reward": null,

View File

@@ -239,14 +239,17 @@ class CleanTradingDashboard:
                 from NN.models.advanced_transformer_trading import create_trading_transformer, TradingTransformerConfig
 
                 config = TradingTransformerConfig(
-                    d_model=256,
-                    n_heads=8,
-                    n_layers=4,
-                    seq_len=50,
+                    d_model=1024,  # 2x increase for 46M parameters
+                    n_heads=16,    # 2x increase
+                    n_layers=12,   # 2x increase
+                    seq_len=150,   # 1.5x increase
                     n_actions=3,
                     use_multi_scale_attention=True,
                     use_market_regime_detection=True,
-                    use_uncertainty_estimation=True
+                    use_uncertainty_estimation=True,
+                    use_deep_attention=True,
+                    use_residual_connections=True,
+                    use_layer_norm_variants=True
                 )
 
                 model, trainer = create_trading_transformer(config)
@@ -4600,14 +4603,17 @@ class CleanTradingDashboard:
             # Create transformer if not exists
             if transformer_model is None or transformer_trainer is None:
                 config = TradingTransformerConfig(
-                    d_model=256,
-                    n_heads=8,
-                    n_layers=4,
-                    seq_len=50,
+                    d_model=1024,  # 2x increase for 46M parameters
+                    n_heads=16,    # 2x increase
+                    n_layers=12,   # 2x increase
+                    seq_len=150,   # 1.5x increase
                     n_actions=3,
                     use_multi_scale_attention=True,
                     use_market_regime_detection=True,
-                    use_uncertainty_estimation=True
+                    use_uncertainty_estimation=True,
+                    use_deep_attention=True,
+                    use_residual_connections=True,
+                    use_layer_norm_variants=True
                 )
                 transformer_model, transformer_trainer = create_trading_transformer(config)