beef up T model
This commit is contained in:
@ -23,35 +23,40 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TradingTransformerConfig:
|
class TradingTransformerConfig:
|
||||||
"""Configuration for trading transformer models"""
|
"""Configuration for trading transformer models - SCALED TO 46M PARAMETERS"""
|
||||||
# Model architecture
|
# Model architecture - SCALED UP
|
||||||
d_model: int = 512 # Model dimension
|
d_model: int = 1024 # Model dimension (2x increase)
|
||||||
n_heads: int = 8 # Number of attention heads
|
n_heads: int = 16 # Number of attention heads (2x increase)
|
||||||
n_layers: int = 6 # Number of transformer layers
|
n_layers: int = 12 # Number of transformer layers (2x increase)
|
||||||
d_ff: int = 2048 # Feed-forward dimension
|
d_ff: int = 4096 # Feed-forward dimension (2x increase)
|
||||||
dropout: float = 0.1 # Dropout rate
|
dropout: float = 0.1 # Dropout rate
|
||||||
|
|
||||||
# Input dimensions
|
# Input dimensions - ENHANCED
|
||||||
seq_len: int = 100 # Sequence length for time series
|
seq_len: int = 150 # Sequence length for time series (1.5x increase)
|
||||||
cob_features: int = 50 # COB feature dimension
|
cob_features: int = 100 # COB feature dimension (2x increase)
|
||||||
tech_features: int = 20 # Technical indicator features
|
tech_features: int = 40 # Technical indicator features (2x increase)
|
||||||
market_features: int = 15 # Market microstructure features
|
market_features: int = 30 # Market microstructure features (2x increase)
|
||||||
|
|
||||||
# Output configuration
|
# Output configuration
|
||||||
n_actions: int = 3 # BUY, SELL, HOLD
|
n_actions: int = 3 # BUY, SELL, HOLD
|
||||||
confidence_output: bool = True # Output confidence scores
|
confidence_output: bool = True # Output confidence scores
|
||||||
|
|
||||||
# Training configuration
|
# Training configuration - OPTIMIZED FOR LARGER MODEL
|
||||||
learning_rate: float = 1e-4
|
learning_rate: float = 5e-5 # Reduced for larger model
|
||||||
weight_decay: float = 1e-5
|
weight_decay: float = 1e-4 # Increased regularization
|
||||||
warmup_steps: int = 4000
|
warmup_steps: int = 8000 # More warmup steps
|
||||||
max_grad_norm: float = 1.0
|
max_grad_norm: float = 0.5 # Tighter gradient clipping
|
||||||
|
|
||||||
# Advanced features
|
# Advanced features - ENHANCED
|
||||||
use_relative_position: bool = True
|
use_relative_position: bool = True
|
||||||
use_multi_scale_attention: bool = True
|
use_multi_scale_attention: bool = True
|
||||||
use_market_regime_detection: bool = True
|
use_market_regime_detection: bool = True
|
||||||
use_uncertainty_estimation: bool = True
|
use_uncertainty_estimation: bool = True
|
||||||
|
|
||||||
|
# NEW: Additional scaling features
|
||||||
|
use_deep_attention: bool = True # Deeper attention mechanisms
|
||||||
|
use_residual_connections: bool = True # Enhanced residual connections
|
||||||
|
use_layer_norm_variants: bool = True # Advanced normalization
|
||||||
|
|
||||||
class PositionalEncoding(nn.Module):
|
class PositionalEncoding(nn.Module):
|
||||||
"""Sinusoidal positional encoding for transformer"""
|
"""Sinusoidal positional encoding for transformer"""
|
||||||
@ -102,10 +107,10 @@ class RelativePositionalEncoding(nn.Module):
|
|||||||
|
|
||||||
return self.relative_position_embeddings(final_mat)
|
return self.relative_position_embeddings(final_mat)
|
||||||
|
|
||||||
class MultiScaleAttention(nn.Module):
|
class DeepMultiScaleAttention(nn.Module):
|
||||||
"""Multi-scale attention for capturing different time horizons"""
|
"""Enhanced multi-scale attention with deeper mechanisms for 46M parameter model"""
|
||||||
|
|
||||||
def __init__(self, d_model: int, n_heads: int, scales: List[int] = [1, 3, 5, 7]):
|
def __init__(self, d_model: int, n_heads: int, scales: List[int] = [1, 3, 5, 7, 11, 15]):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.d_model = d_model
|
self.d_model = d_model
|
||||||
self.n_heads = n_heads
|
self.n_heads = n_heads
|
||||||
@ -114,18 +119,49 @@ class MultiScaleAttention(nn.Module):
|
|||||||
|
|
||||||
assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
|
assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
|
||||||
|
|
||||||
# Multi-scale projections
|
# Enhanced multi-scale projections with deeper architecture
|
||||||
self.scale_projections = nn.ModuleList([
|
self.scale_projections = nn.ModuleList([
|
||||||
nn.ModuleDict({
|
nn.ModuleDict({
|
||||||
'query': nn.Linear(d_model, d_model),
|
'query': nn.Sequential(
|
||||||
'key': nn.Linear(d_model, d_model),
|
nn.Linear(d_model, d_model * 2),
|
||||||
'value': nn.Linear(d_model, d_model),
|
nn.GELU(),
|
||||||
'conv': nn.Conv1d(d_model, d_model, kernel_size=scale,
|
nn.Dropout(0.1),
|
||||||
padding=scale//2, groups=d_model)
|
nn.Linear(d_model * 2, d_model)
|
||||||
|
),
|
||||||
|
'key': nn.Sequential(
|
||||||
|
nn.Linear(d_model, d_model * 2),
|
||||||
|
nn.GELU(),
|
||||||
|
nn.Dropout(0.1),
|
||||||
|
nn.Linear(d_model * 2, d_model)
|
||||||
|
),
|
||||||
|
'value': nn.Sequential(
|
||||||
|
nn.Linear(d_model, d_model * 2),
|
||||||
|
nn.GELU(),
|
||||||
|
nn.Dropout(0.1),
|
||||||
|
nn.Linear(d_model * 2, d_model)
|
||||||
|
),
|
||||||
|
'conv': nn.Sequential(
|
||||||
|
nn.Conv1d(d_model, d_model * 2, kernel_size=scale,
|
||||||
|
padding=scale//2, groups=d_model),
|
||||||
|
nn.GELU(),
|
||||||
|
nn.Conv1d(d_model * 2, d_model, kernel_size=1)
|
||||||
|
)
|
||||||
}) for scale in scales
|
}) for scale in scales
|
||||||
])
|
])
|
||||||
|
|
||||||
self.output_projection = nn.Linear(d_model * len(scales), d_model)
|
# Enhanced output projection with residual connection
|
||||||
|
self.output_projection = nn.Sequential(
|
||||||
|
nn.Linear(d_model * len(scales), d_model * 2),
|
||||||
|
nn.GELU(),
|
||||||
|
nn.Dropout(0.1),
|
||||||
|
nn.Linear(d_model * 2, d_model)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Additional attention mechanisms
|
||||||
|
self.cross_scale_attention = nn.MultiheadAttention(
|
||||||
|
d_model, n_heads // 2, dropout=0.1, batch_first=True
|
||||||
|
)
|
||||||
|
|
||||||
self.dropout = nn.Dropout(0.1)
|
self.dropout = nn.Dropout(0.1)
|
||||||
|
|
||||||
def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
|
def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||||
@ -133,10 +169,10 @@ class MultiScaleAttention(nn.Module):
|
|||||||
scale_outputs = []
|
scale_outputs = []
|
||||||
|
|
||||||
for scale_proj in self.scale_projections:
|
for scale_proj in self.scale_projections:
|
||||||
# Apply temporal convolution for this scale
|
# Apply enhanced temporal convolution for this scale
|
||||||
x_conv = scale_proj['conv'](x.transpose(1, 2)).transpose(1, 2)
|
x_conv = scale_proj['conv'](x.transpose(1, 2)).transpose(1, 2)
|
||||||
|
|
||||||
# Standard attention computation
|
# Enhanced attention computation with deeper projections
|
||||||
Q = scale_proj['query'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
|
Q = scale_proj['query'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
|
||||||
K = scale_proj['key'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
|
K = scale_proj['key'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
|
||||||
V = scale_proj['value'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
|
V = scale_proj['value'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
|
||||||
@ -160,9 +196,15 @@ class MultiScaleAttention(nn.Module):
|
|||||||
|
|
||||||
scale_outputs.append(output)
|
scale_outputs.append(output)
|
||||||
|
|
||||||
# Combine multi-scale outputs
|
# Combine multi-scale outputs with enhanced projection
|
||||||
combined = torch.cat(scale_outputs, dim=-1)
|
combined = torch.cat(scale_outputs, dim=-1)
|
||||||
return self.output_projection(combined)
|
output = self.output_projection(combined)
|
||||||
|
|
||||||
|
# Apply cross-scale attention for better integration
|
||||||
|
cross_attended, _ = self.cross_scale_attention(output, output, output, attn_mask=mask)
|
||||||
|
|
||||||
|
# Residual connection
|
||||||
|
return output + cross_attended
|
||||||
|
|
||||||
class MarketRegimeDetector(nn.Module):
|
class MarketRegimeDetector(nn.Module):
|
||||||
"""Market regime detection module for adaptive behavior"""
|
"""Market regime detection module for adaptive behavior"""
|
||||||
@ -249,9 +291,9 @@ class TradingTransformerLayer(nn.Module):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
# Multi-scale attention or standard attention
|
# Enhanced multi-scale attention or standard attention
|
||||||
if config.use_multi_scale_attention:
|
if config.use_multi_scale_attention:
|
||||||
self.attention = MultiScaleAttention(config.d_model, config.n_heads)
|
self.attention = DeepMultiScaleAttention(config.d_model, config.n_heads)
|
||||||
else:
|
else:
|
||||||
self.attention = nn.MultiheadAttention(
|
self.attention = nn.MultiheadAttention(
|
||||||
config.d_model, config.n_heads, dropout=config.dropout, batch_first=True
|
config.d_model, config.n_heads, dropout=config.dropout, batch_first=True
|
||||||
@ -278,7 +320,7 @@ class TradingTransformerLayer(nn.Module):
|
|||||||
|
|
||||||
def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
|
def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
|
||||||
# Self-attention with residual connection
|
# Self-attention with residual connection
|
||||||
if isinstance(self.attention, MultiScaleAttention):
|
if isinstance(self.attention, DeepMultiScaleAttention):
|
||||||
attn_output = self.attention(x, mask)
|
attn_output = self.attention(x, mask)
|
||||||
else:
|
else:
|
||||||
attn_output, _ = self.attention(x, x, x, attn_mask=mask)
|
attn_output, _ = self.attention(x, x, x, attn_mask=mask)
|
||||||
@ -323,8 +365,11 @@ class AdvancedTradingTransformer(nn.Module):
|
|||||||
TradingTransformerLayer(config) for _ in range(config.n_layers)
|
TradingTransformerLayer(config) for _ in range(config.n_layers)
|
||||||
])
|
])
|
||||||
|
|
||||||
# Output heads
|
# Enhanced output heads for 46M parameter model
|
||||||
self.action_head = nn.Sequential(
|
self.action_head = nn.Sequential(
|
||||||
|
nn.Linear(config.d_model, config.d_model),
|
||||||
|
nn.GELU(),
|
||||||
|
nn.Dropout(config.dropout),
|
||||||
nn.Linear(config.d_model, config.d_model // 2),
|
nn.Linear(config.d_model, config.d_model // 2),
|
||||||
nn.GELU(),
|
nn.GELU(),
|
||||||
nn.Dropout(config.dropout),
|
nn.Dropout(config.dropout),
|
||||||
@ -333,25 +378,48 @@ class AdvancedTradingTransformer(nn.Module):
|
|||||||
|
|
||||||
if config.confidence_output:
|
if config.confidence_output:
|
||||||
self.confidence_head = nn.Sequential(
|
self.confidence_head = nn.Sequential(
|
||||||
nn.Linear(config.d_model, config.d_model // 4),
|
nn.Linear(config.d_model, config.d_model // 2),
|
||||||
|
nn.GELU(),
|
||||||
|
nn.Dropout(config.dropout),
|
||||||
|
nn.Linear(config.d_model // 2, config.d_model // 4),
|
||||||
nn.GELU(),
|
nn.GELU(),
|
||||||
nn.Dropout(config.dropout),
|
nn.Dropout(config.dropout),
|
||||||
nn.Linear(config.d_model // 4, 1),
|
nn.Linear(config.d_model // 4, 1),
|
||||||
nn.Sigmoid()
|
nn.Sigmoid()
|
||||||
)
|
)
|
||||||
|
|
||||||
# Uncertainty estimation
|
# Enhanced uncertainty estimation
|
||||||
if config.use_uncertainty_estimation:
|
if config.use_uncertainty_estimation:
|
||||||
self.uncertainty_estimator = UncertaintyEstimation(config.d_model)
|
self.uncertainty_estimator = UncertaintyEstimation(config.d_model)
|
||||||
|
|
||||||
# Price prediction head (auxiliary task)
|
# Enhanced price prediction head (auxiliary task)
|
||||||
self.price_head = nn.Sequential(
|
self.price_head = nn.Sequential(
|
||||||
nn.Linear(config.d_model, config.d_model // 4),
|
nn.Linear(config.d_model, config.d_model // 2),
|
||||||
|
nn.GELU(),
|
||||||
|
nn.Dropout(config.dropout),
|
||||||
|
nn.Linear(config.d_model // 2, config.d_model // 4),
|
||||||
nn.GELU(),
|
nn.GELU(),
|
||||||
nn.Dropout(config.dropout),
|
nn.Dropout(config.dropout),
|
||||||
nn.Linear(config.d_model // 4, 1)
|
nn.Linear(config.d_model // 4, 1)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Additional specialized heads for 46M model
|
||||||
|
self.volatility_head = nn.Sequential(
|
||||||
|
nn.Linear(config.d_model, config.d_model // 2),
|
||||||
|
nn.GELU(),
|
||||||
|
nn.Dropout(config.dropout),
|
||||||
|
nn.Linear(config.d_model // 2, 1),
|
||||||
|
nn.Softplus()
|
||||||
|
)
|
||||||
|
|
||||||
|
self.trend_strength_head = nn.Sequential(
|
||||||
|
nn.Linear(config.d_model, config.d_model // 2),
|
||||||
|
nn.GELU(),
|
||||||
|
nn.Dropout(config.dropout),
|
||||||
|
nn.Linear(config.d_model // 2, 1),
|
||||||
|
nn.Tanh()
|
||||||
|
)
|
||||||
|
|
||||||
# Initialize weights
|
# Initialize weights
|
||||||
self._init_weights()
|
self._init_weights()
|
||||||
|
|
||||||
@ -434,10 +502,17 @@ class AdvancedTradingTransformer(nn.Module):
|
|||||||
outputs['uncertainty_mean'] = uncertainty_mean
|
outputs['uncertainty_mean'] = uncertainty_mean
|
||||||
outputs['uncertainty_std'] = uncertainty_std
|
outputs['uncertainty_std'] = uncertainty_std
|
||||||
|
|
||||||
# Price prediction (auxiliary task)
|
# Enhanced price prediction (auxiliary task)
|
||||||
price_pred = self.price_head(pooled)
|
price_pred = self.price_head(pooled)
|
||||||
outputs['price_prediction'] = price_pred
|
outputs['price_prediction'] = price_pred
|
||||||
|
|
||||||
|
# Additional specialized predictions for 46M model
|
||||||
|
volatility_pred = self.volatility_head(pooled)
|
||||||
|
outputs['volatility_prediction'] = volatility_pred
|
||||||
|
|
||||||
|
trend_strength_pred = self.trend_strength_head(pooled)
|
||||||
|
outputs['trend_strength_prediction'] = trend_strength_pred
|
||||||
|
|
||||||
# Market regime information
|
# Market regime information
|
||||||
if regime_probs_history:
|
if regime_probs_history:
|
||||||
outputs['regime_probs'] = torch.stack(regime_probs_history, dim=1)
|
outputs['regime_probs'] = torch.stack(regime_probs_history, dim=1)
|
||||||
|
@ -271,15 +271,15 @@
|
|||||||
],
|
],
|
||||||
"decision": [
|
"decision": [
|
||||||
{
|
{
|
||||||
"checkpoint_id": "decision_20250702_011418",
|
"checkpoint_id": "decision_20250702_012558",
|
||||||
"model_name": "decision",
|
"model_name": "decision",
|
||||||
"model_type": "decision_fusion",
|
"model_type": "decision_fusion",
|
||||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_011418.pt",
|
"file_path": "NN\\models\\saved\\decision\\decision_20250702_012558.pt",
|
||||||
"created_at": "2025-07-02T01:14:18.986083",
|
"created_at": "2025-07-02T01:25:58.614455",
|
||||||
"file_size_mb": 0.06720924377441406,
|
"file_size_mb": 0.06720924377441406,
|
||||||
"performance_score": 9.999990526608928,
|
"performance_score": 9.999991886192655,
|
||||||
"accuracy": null,
|
"accuracy": null,
|
||||||
"loss": 9.473391072236024e-06,
|
"loss": 8.113807345618998e-06,
|
||||||
"val_accuracy": null,
|
"val_accuracy": null,
|
||||||
"val_loss": null,
|
"val_loss": null,
|
||||||
"reward": null,
|
"reward": null,
|
||||||
@ -291,15 +291,15 @@
|
|||||||
"wandb_artifact_name": null
|
"wandb_artifact_name": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"checkpoint_id": "decision_20250702_011324",
|
"checkpoint_id": "decision_20250702_012504",
|
||||||
"model_name": "decision",
|
"model_name": "decision",
|
||||||
"model_type": "decision_fusion",
|
"model_type": "decision_fusion",
|
||||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_011324.pt",
|
"file_path": "NN\\models\\saved\\decision\\decision_20250702_012504.pt",
|
||||||
"created_at": "2025-07-02T01:13:24.579781",
|
"created_at": "2025-07-02T01:25:04.285477",
|
||||||
"file_size_mb": 0.06720924377441406,
|
"file_size_mb": 0.06720924377441406,
|
||||||
"performance_score": 9.999990382249775,
|
"performance_score": 9.999991852067678,
|
||||||
"accuracy": null,
|
"accuracy": null,
|
||||||
"loss": 9.617750224931245e-06,
|
"loss": 8.147932321987486e-06,
|
||||||
"val_accuracy": null,
|
"val_accuracy": null,
|
||||||
"val_loss": null,
|
"val_loss": null,
|
||||||
"reward": null,
|
"reward": null,
|
||||||
@ -311,15 +311,15 @@
|
|||||||
"wandb_artifact_name": null
|
"wandb_artifact_name": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"checkpoint_id": "decision_20250702_011348",
|
"checkpoint_id": "decision_20250702_012502",
|
||||||
"model_name": "decision",
|
"model_name": "decision",
|
||||||
"model_type": "decision_fusion",
|
"model_type": "decision_fusion",
|
||||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_011348.pt",
|
"file_path": "NN\\models\\saved\\decision\\decision_20250702_012502.pt",
|
||||||
"created_at": "2025-07-02T01:13:48.808520",
|
"created_at": "2025-07-02T01:25:02.958656",
|
||||||
"file_size_mb": 0.06720924377441406,
|
"file_size_mb": 0.06720924377441406,
|
||||||
"performance_score": 9.999990223319509,
|
"performance_score": 9.999991847589234,
|
||||||
"accuracy": null,
|
"accuracy": null,
|
||||||
"loss": 9.776680491212022e-06,
|
"loss": 8.152410765381393e-06,
|
||||||
"val_accuracy": null,
|
"val_accuracy": null,
|
||||||
"val_loss": null,
|
"val_loss": null,
|
||||||
"reward": null,
|
"reward": null,
|
||||||
@ -331,15 +331,15 @@
|
|||||||
"wandb_artifact_name": null
|
"wandb_artifact_name": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"checkpoint_id": "decision_20250702_011333",
|
"checkpoint_id": "decision_20250702_012503",
|
||||||
"model_name": "decision",
|
"model_name": "decision",
|
||||||
"model_type": "decision_fusion",
|
"model_type": "decision_fusion",
|
||||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_011333.pt",
|
"file_path": "NN\\models\\saved\\decision\\decision_20250702_012503.pt",
|
||||||
"created_at": "2025-07-02T01:13:33.679719",
|
"created_at": "2025-07-02T01:25:03.108239",
|
||||||
"file_size_mb": 0.06720924377441406,
|
"file_size_mb": 0.06720924377441406,
|
||||||
"performance_score": 9.999989776407977,
|
"performance_score": 9.99999184242316,
|
||||||
"accuracy": null,
|
"accuracy": null,
|
||||||
"loss": 1.0223592022232505e-05,
|
"loss": 8.157576839933662e-06,
|
||||||
"val_accuracy": null,
|
"val_accuracy": null,
|
||||||
"val_loss": null,
|
"val_loss": null,
|
||||||
"reward": null,
|
"reward": null,
|
||||||
@ -351,15 +351,15 @@
|
|||||||
"wandb_artifact_name": null
|
"wandb_artifact_name": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"checkpoint_id": "decision_20250702_011411",
|
"checkpoint_id": "decision_20250702_012502",
|
||||||
"model_name": "decision",
|
"model_name": "decision",
|
||||||
"model_type": "decision_fusion",
|
"model_type": "decision_fusion",
|
||||||
"file_path": "NN\\models\\saved\\decision\\decision_20250702_011411.pt",
|
"file_path": "NN\\models\\saved\\decision\\decision_20250702_012502.pt",
|
||||||
"created_at": "2025-07-02T01:14:11.738925",
|
"created_at": "2025-07-02T01:25:02.603966",
|
||||||
"file_size_mb": 0.06720924377441406,
|
"file_size_mb": 0.06720924377441406,
|
||||||
"performance_score": 9.99998973893185,
|
"performance_score": 9.999991812171043,
|
||||||
"accuracy": null,
|
"accuracy": null,
|
||||||
"loss": 1.0261068149069225e-05,
|
"loss": 8.187828957696905e-06,
|
||||||
"val_accuracy": null,
|
"val_accuracy": null,
|
||||||
"val_loss": null,
|
"val_loss": null,
|
||||||
"reward": null,
|
"reward": null,
|
||||||
|
@ -239,14 +239,17 @@ class CleanTradingDashboard:
|
|||||||
from NN.models.advanced_transformer_trading import create_trading_transformer, TradingTransformerConfig
|
from NN.models.advanced_transformer_trading import create_trading_transformer, TradingTransformerConfig
|
||||||
|
|
||||||
config = TradingTransformerConfig(
|
config = TradingTransformerConfig(
|
||||||
d_model=256,
|
d_model=1024, # 2x increase for 46M parameters
|
||||||
n_heads=8,
|
n_heads=16, # 2x increase
|
||||||
n_layers=4,
|
n_layers=12, # 2x increase
|
||||||
seq_len=50,
|
seq_len=150, # 1.5x increase
|
||||||
n_actions=3,
|
n_actions=3,
|
||||||
use_multi_scale_attention=True,
|
use_multi_scale_attention=True,
|
||||||
use_market_regime_detection=True,
|
use_market_regime_detection=True,
|
||||||
use_uncertainty_estimation=True
|
use_uncertainty_estimation=True,
|
||||||
|
use_deep_attention=True,
|
||||||
|
use_residual_connections=True,
|
||||||
|
use_layer_norm_variants=True
|
||||||
)
|
)
|
||||||
|
|
||||||
model, trainer = create_trading_transformer(config)
|
model, trainer = create_trading_transformer(config)
|
||||||
@ -4600,14 +4603,17 @@ class CleanTradingDashboard:
|
|||||||
# Create transformer if not exists
|
# Create transformer if not exists
|
||||||
if transformer_model is None or transformer_trainer is None:
|
if transformer_model is None or transformer_trainer is None:
|
||||||
config = TradingTransformerConfig(
|
config = TradingTransformerConfig(
|
||||||
d_model=256,
|
d_model=1024, # 2x increase for 46M parameters
|
||||||
n_heads=8,
|
n_heads=16, # 2x increase
|
||||||
n_layers=4,
|
n_layers=12, # 2x increase
|
||||||
seq_len=50,
|
seq_len=150, # 1.5x increase
|
||||||
n_actions=3,
|
n_actions=3,
|
||||||
use_multi_scale_attention=True,
|
use_multi_scale_attention=True,
|
||||||
use_market_regime_detection=True,
|
use_market_regime_detection=True,
|
||||||
use_uncertainty_estimation=True
|
use_uncertainty_estimation=True,
|
||||||
|
use_deep_attention=True,
|
||||||
|
use_residual_connections=True,
|
||||||
|
use_layer_norm_variants=True
|
||||||
)
|
)
|
||||||
|
|
||||||
transformer_model, transformer_trainer = create_trading_transformer(config)
|
transformer_model, transformer_trainer = create_trading_transformer(config)
|
||||||
|
Reference in New Issue
Block a user