model metadata

This commit is contained in:
Dobromir Popov
2025-11-12 14:37:23 +02:00
parent 4c04503f3e
commit 0c987c3557
4 changed files with 88 additions and 9 deletions

11
.kiro/settings/mcp.json Normal file
View File

@@ -0,0 +1,11 @@
{
"mcpServers": {
"fetch": {
"command": "uvx",
"args": ["mcp-server-fetch"],
"env": {},
"disabled": true,
"autoApprove": []
}
}
}

View File

@@ -1723,15 +1723,9 @@ class RealTrainingAdapter:
else:
logger.warning(f" Batch {i + 1} returned None result - skipping")
# CRITICAL FIX: Delete batch tensors immediately to free GPU memory
# This prevents memory accumulation during gradient accumulation
for key in list(batch.keys()):
if isinstance(batch[key], torch.Tensor):
del batch[key]
del batch
# CRITICAL: Clear CUDA cache after EVERY batch to prevent memory accumulation
# This is essential with large models and limited GPU memory
# NOTE: We don't delete the batch dict itself because it's reused across epochs
if torch.cuda.is_available():
torch.cuda.empty_cache()
@@ -1786,6 +1780,41 @@ class RealTrainingAdapter:
logger.info(f" Saved checkpoint: {checkpoint_path}")
# Save metadata to database for easy retrieval
try:
from utils.database_manager import DatabaseManager
db_manager = DatabaseManager()
checkpoint_id = f"transformer_e{epoch+1}_{timestamp}"
# Create metadata object
from utils.database_manager import CheckpointMetadata
metadata = CheckpointMetadata(
checkpoint_id=checkpoint_id,
model_name="transformer",
model_type="transformer",
timestamp=datetime.now(),
performance_metrics={
'loss': float(avg_loss),
'accuracy': float(avg_accuracy),
'epoch': epoch + 1,
'learning_rate': float(trainer.scheduler.get_last_lr()[0])
},
training_metadata={
'num_samples': len(training_data),
'num_batches': num_batches,
'training_id': training_id
},
file_path=checkpoint_path,
performance_score=float(avg_accuracy), # Use accuracy as score
is_active=True
)
if db_manager.save_checkpoint_metadata(metadata):
logger.info(f" Saved checkpoint metadata to database: {checkpoint_id}")
except Exception as meta_error:
logger.warning(f" Could not save checkpoint metadata: {meta_error}")
# Keep only best 5 checkpoints based on accuracy
self._cleanup_old_checkpoints(checkpoint_dir, keep_best=5, metric='accuracy')

View File

@@ -193,7 +193,7 @@ class AnnotationDashboard:
def _get_best_checkpoint_info(self, model_name: str) -> Optional[Dict]:
"""
Get best checkpoint info for a model without loading it
Uses filename parsing instead of torch.load to avoid crashes
First tries database, then falls back to filename parsing
Args:
model_name: Name of the model
@@ -202,6 +202,41 @@ class AnnotationDashboard:
Dict with checkpoint info or None if no checkpoint found
"""
try:
# Try to get from database first (has full metadata)
try:
from utils.database_manager import DatabaseManager
db_manager = DatabaseManager()
# Get active checkpoint for this model
with db_manager._get_connection() as conn:
cursor = conn.execute("""
SELECT checkpoint_id, performance_metrics, timestamp, file_path
FROM checkpoint_metadata
WHERE model_name = ? AND is_active = TRUE
ORDER BY performance_score DESC
LIMIT 1
""", (model_name.lower(),))
row = cursor.fetchone()
if row:
import json
checkpoint_id, metrics_json, timestamp, file_path = row
metrics = json.loads(metrics_json) if metrics_json else {}
checkpoint_info = {
'filename': os.path.basename(file_path) if file_path else checkpoint_id,
'epoch': metrics.get('epoch', 0),
'loss': metrics.get('loss'),
'accuracy': metrics.get('accuracy'),
'source': 'database'
}
logger.info(f"Loaded checkpoint info from database for {model_name}: E{checkpoint_info['epoch']}, Loss={checkpoint_info['loss']}, Acc={checkpoint_info['accuracy']}")
return checkpoint_info
except Exception as db_error:
logger.debug(f"Could not load from database: {db_error}")
# Fallback to filename parsing
import glob
import re

View File

@@ -121,3 +121,7 @@ Let's use the mean squared difference between the prediction and the empirical o
----------
Can we check the "live inference" mode now? It should do a realtime inference/training pass each second (as many batches as can fit in 1s), and the prediction should be the next candle — training will be retrospective with a 1-candle delay (called each s, m, h and d for the previous candle, once we know the result).
Calculate the angle between each pair of candles' features and train to predict those (top–top, open–open, etc.).
Use this source for sentiment analysis:
https://www.coinglass.com/LongShortRatio