Fix transformer checkpoint handling and model loss measurement
@@ -15,6 +15,7 @@ import logging
 import uuid
 import time
 import threading
+import os
 from typing import Dict, List, Optional, Any
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
@@ -1447,6 +1448,71 @@ class RealTrainingAdapter:
             logger.error(traceback.format_exc())
             return None
 
+    def _find_best_checkpoint(self, checkpoint_dir: str, metric: str = 'accuracy') -> Optional[str]:
+        """Find the best checkpoint based on a metric"""
+        try:
+            if not os.path.exists(checkpoint_dir):
+                return None
+
+            checkpoints = []
+            for filename in os.listdir(checkpoint_dir):
+                if filename.endswith('.pt'):
+                    filepath = os.path.join(checkpoint_dir, filename)
+                    try:
+                        checkpoint = torch.load(filepath, map_location='cpu')
+                        checkpoints.append({
+                            'path': filepath,
+                            'metric_value': checkpoint.get(metric, 0),
+                            'epoch': checkpoint.get('epoch', 0)
+                        })
+                    except Exception as e:
+                        logger.debug(f"Could not load checkpoint {filename}: {e}")
+
+            if not checkpoints:
+                return None
+
+            # Sort by metric (higher is better for accuracy)
+            checkpoints.sort(key=lambda x: x['metric_value'], reverse=True)
+            return checkpoints[0]['path']
+
+        except Exception as e:
+            logger.error(f"Error finding best checkpoint: {e}")
+            return None
+
+    def _cleanup_old_checkpoints(self, checkpoint_dir: str, keep_best: int = 5, metric: str = 'accuracy'):
+        """Keep only the best N checkpoints"""
+        try:
+            if not os.path.exists(checkpoint_dir):
+                return
+
+            checkpoints = []
+            for filename in os.listdir(checkpoint_dir):
+                if filename.endswith('.pt'):
+                    filepath = os.path.join(checkpoint_dir, filename)
+                    try:
+                        checkpoint = torch.load(filepath, map_location='cpu')
+                        checkpoints.append({
+                            'path': filepath,
+                            'metric_value': checkpoint.get(metric, 0),
+                            'epoch': checkpoint.get('epoch', 0)
+                        })
+                    except Exception as e:
+                        logger.debug(f"Could not load checkpoint {filename}: {e}")
+
+            # Sort by metric (higher is better)
+            checkpoints.sort(key=lambda x: x['metric_value'], reverse=True)
+
+            # Delete checkpoints beyond keep_best
+            for checkpoint in checkpoints[keep_best:]:
+                try:
+                    os.remove(checkpoint['path'])
+                    logger.debug(f"Removed old checkpoint: {checkpoint['path']}")
+                except Exception as e:
+                    logger.warning(f"Could not remove checkpoint: {e}")
+
+        except Exception as e:
+            logger.error(f"Error cleaning up checkpoints: {e}")
+
     def _train_transformer_real(self, session: TrainingSession, training_data: List[Dict]):
         """
         Train Transformer model using orchestrator's existing training infrastructure
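
The two helpers above rank checkpoint files by a metric stored at the top level of each saved dict. A minimal sketch of the checkpoint layout they assume, with illustrative values (the stand-in model and file name are not from this commit):

import torch
import torch.nn as nn

# Stand-in model; the real code saves trainer.model.state_dict()
model = nn.Linear(4, 3)
torch.save({
    'epoch': 3,
    'model_state_dict': model.state_dict(),
    'loss': 0.42,
    'accuracy': 0.61,  # key read back by checkpoint.get(metric, 0)
}, 'transformer_epoch3_example.pt')

ckpt = torch.load('transformer_epoch3_example.pt', map_location='cpu')
print(ckpt.get('accuracy', 0), ckpt.get('epoch', 0))  # 0.61 3
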
@@ -1466,6 +1532,25 @@ class RealTrainingAdapter:
             logger.info(f"Using orchestrator's TradingTransformerTrainer")
             logger.info(f" Trainer type: {type(trainer).__name__}")
 
+            # Load best checkpoint if available to continue training
+            try:
+                checkpoint_dir = "models/checkpoints/transformer"
+                best_checkpoint_path = self._find_best_checkpoint(checkpoint_dir, metric='accuracy')
+
+                if best_checkpoint_path and os.path.exists(best_checkpoint_path):
+                    checkpoint = torch.load(best_checkpoint_path)
+                    trainer.model.load_state_dict(checkpoint['model_state_dict'])
+                    trainer.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+                    trainer.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+
+                    logger.info(f" Loaded checkpoint from epoch {checkpoint.get('epoch', 0)}")
+                    logger.info(f" Previous best: Loss={checkpoint.get('loss', 0):.6f}, Accuracy={checkpoint.get('accuracy', 0):.2%}")
+                else:
+                    logger.info(" No previous checkpoint found, starting fresh")
+            except Exception as e:
+                logger.warning(f" Failed to load checkpoint: {e}")
+                logger.info(" Starting with fresh model weights")
+
             # Use the trainer's train_step method for individual samples
             if hasattr(trainer, 'train_step'):
                 logger.info(" Using trainer.train_step() method")
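
For reference, a minimal self-contained sketch of the resume pattern used above, on a toy model, optimizer, and scheduler (the file name is illustrative). Note that the helper loads with map_location='cpu' while the resume path above calls torch.load() without it, so the device the checkpoint was saved on must still be available there:

import torch
import torch.nn as nn

model = nn.Linear(4, 3)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

torch.save({
    'epoch': 5,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
}, 'resume_example.pt')

# Restore all three pieces of training state, then continue training
ckpt = torch.load('resume_example.pt', map_location='cpu')
model.load_state_dict(ckpt['model_state_dict'])
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
scheduler.load_state_dict(ckpt['scheduler_state_dict'])
print(f"Resuming from epoch {ckpt.get('epoch', 0)}")
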
@@ -1549,13 +1634,14 @@ class RealTrainingAdapter:
                     if result is not None:
                         batch_loss = result.get('total_loss', 0.0)
                         batch_accuracy = result.get('accuracy', 0.0)
+                        batch_candle_accuracy = result.get('candle_accuracy', 0.0)
                         epoch_loss += batch_loss
                         epoch_accuracy += batch_accuracy
                         num_batches += 1
 
                         # Log first batch and every 10th batch for debugging
                         if (i + 1) == 1 or (i + 1) % 10 == 0:
-                            logger.info(f" Batch {i + 1}/{len(grouped_batches)}, Loss: {batch_loss:.6f}, Accuracy: {batch_accuracy:.4f}")
+                            logger.info(f" Batch {i + 1}/{len(grouped_batches)}, Loss: {batch_loss:.6f}, Action Acc: {batch_accuracy:.2%}, Candle Acc: {batch_candle_accuracy:.2%}")
                     else:
                         logger.warning(f" Batch {i + 1} returned None result - skipping")
 
@@ -1577,6 +1663,32 @@ class RealTrainingAdapter:
                     session.current_epoch = epoch + 1
                     session.current_loss = avg_loss
 
+                    # Save checkpoint after each epoch
+                    try:
+                        checkpoint_dir = "models/checkpoints/transformer"
+                        os.makedirs(checkpoint_dir, exist_ok=True)
+
+                        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+                        checkpoint_path = os.path.join(checkpoint_dir, f"transformer_epoch{epoch+1}_{timestamp}.pt")
+
+                        torch.save({
+                            'epoch': epoch + 1,
+                            'model_state_dict': trainer.model.state_dict(),
+                            'optimizer_state_dict': trainer.optimizer.state_dict(),
+                            'scheduler_state_dict': trainer.scheduler.state_dict(),
+                            'loss': avg_loss,
+                            'accuracy': avg_accuracy,
+                            'learning_rate': trainer.scheduler.get_last_lr()[0]
+                        }, checkpoint_path)
+
+                        logger.info(f" Saved checkpoint: {checkpoint_path}")
+
+                        # Keep only best 5 checkpoints based on accuracy
+                        self._cleanup_old_checkpoints(checkpoint_dir, keep_best=5, metric='accuracy')
+
+                    except Exception as e:
+                        logger.warning(f" Failed to save checkpoint: {e}")
+
                     # Clear CUDA cache after each epoch
                     if torch.cuda.is_available():
                         torch.cuda.empty_cache()
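
The per-epoch file name above combines the epoch number with a save timestamp, while the cleanup that follows ranks files by their stored accuracy rather than by recency. A tiny illustration of the name template (the printed time is made up):

import os
from datetime import datetime

checkpoint_dir = "models/checkpoints/transformer"
epoch = 0  # first epoch of a zero-based loop
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
checkpoint_path = os.path.join(checkpoint_dir, f"transformer_epoch{epoch+1}_{timestamp}.pt")
print(checkpoint_path)  # e.g. models/checkpoints/transformer/transformer_epoch1_20250101_093000.pt
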
@@ -1586,6 +1698,16 @@ class RealTrainingAdapter:
                 session.final_loss = session.current_loss
                 session.accuracy = avg_accuracy
 
+                # Log best checkpoint info
+                try:
+                    checkpoint_dir = "models/checkpoints/transformer"
+                    best_checkpoint_path = self._find_best_checkpoint(checkpoint_dir, metric='accuracy')
+                    if best_checkpoint_path:
+                        checkpoint = torch.load(best_checkpoint_path, map_location='cpu')
+                        logger.info(f" Best checkpoint: epoch {checkpoint.get('epoch', 0)}, accuracy: {checkpoint.get('accuracy', 0):.2%}")
+                except Exception as e:
+                    logger.debug(f"Could not load best checkpoint info: {e}")
+
                 logger.info(f" Training complete: Loss = {session.final_loss:.6f}, Accuracy = {session.accuracy:.2%}")
 
             else:
@@ -1268,12 +1268,27 @@ class TradingTransformerTrainer:
         predictions = torch.argmax(outputs['action_logits'], dim=-1)
         accuracy = (predictions == batch['actions']).float().mean()
 
+        # Calculate candle prediction accuracy (price direction)
+        candle_accuracy = 0.0
+        if 'next_candles' in outputs and 'future_prices' in batch:
+            # Use 1m timeframe prediction as primary
+            if '1m' in outputs['next_candles']:
+                predicted_candle = outputs['next_candles']['1m']  # [batch, 5]
+                # Predicted close is the 4th value (index 3)
+                predicted_close_change = predicted_candle[:, 3]  # Predicted close price change
+                actual_close_change = batch['future_prices']  # Actual price change ratio
+
+                # Check if direction matches (both positive or both negative)
+                direction_match = (torch.sign(predicted_close_change) == torch.sign(actual_close_change)).float()
+                candle_accuracy = direction_match.mean().item()
+
         # Extract values and delete tensors to free memory
         result = {
             'total_loss': total_loss.item(),
             'action_loss': action_loss.item(),
             'price_loss': price_loss.item(),
             'accuracy': accuracy.item(),
+            'candle_accuracy': candle_accuracy,
             'learning_rate': self.scheduler.get_last_lr()[0]
         }
 
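
The candle accuracy added here counts a prediction as correct when the predicted and actual close changes share the same sign. A small standalone sketch of that direction check on made-up tensors, following the [batch, 5] candle layout the diff assumes (close change at index 3):

import torch

predicted_candle = torch.tensor([[0.10, 0.20, -0.10,  0.03, 1.0],
                                 [0.00, 0.10, -0.20, -0.05, 2.0]])  # [batch, 5]
future_prices = torch.tensor([0.01, 0.04])  # actual close change ratios

predicted_close_change = predicted_candle[:, 3]
direction_match = (torch.sign(predicted_close_change) == torch.sign(future_prices)).float()
print(direction_match.mean().item())  # 0.5 -> one of two directions matched
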