I think we fixed the MEXC interface at the end!

Dobromir Popov
2025-07-04 02:14:29 +03:00
parent 978cecf0c5
commit cf91e090c8
7 changed files with 445 additions and 613 deletions

@@ -133,19 +133,37 @@ class CheckpointManager:
     def load_best_checkpoint(self, model_name: str) -> Optional[Tuple[str, CheckpointMetadata]]:
         try:
-            if model_name not in self.checkpoints or not self.checkpoints[model_name]:
-                logger.warning(f"No checkpoints found for model: {model_name}")
-                return None
+            # First, try the standard checkpoint system
+            if model_name in self.checkpoints and self.checkpoints[model_name]:
+                # Filter out checkpoints with non-existent files
+                valid_checkpoints = [
+                    cp for cp in self.checkpoints[model_name]
+                    if Path(cp.file_path).exists()
+                ]
+
+                if valid_checkpoints:
+                    best_checkpoint = max(valid_checkpoints, key=lambda x: x.performance_score)
+                    logger.debug(f"Loading best checkpoint for {model_name}: {best_checkpoint.checkpoint_id}")
+                    return best_checkpoint.file_path, best_checkpoint
+                else:
+                    # Clean up invalid metadata entries
+                    invalid_count = len(self.checkpoints[model_name])
+                    logger.warning(f"Found {invalid_count} invalid checkpoint entries for {model_name}, cleaning up metadata")
+                    self.checkpoints[model_name] = []
+                    self._save_metadata()
-            best_checkpoint = max(self.checkpoints[model_name], key=lambda x: x.performance_score)
+            # Fallback: Look for existing saved models in the legacy format
+            logger.debug(f"No valid checkpoints found for model: {model_name}, attempting to find legacy saved models")
+            legacy_model_path = self._find_legacy_model(model_name)
-            if not Path(best_checkpoint.file_path).exists():
-                # temporary disable logging to avoid spam
-                # logger.error(f"Best checkpoint file not found: {best_checkpoint.file_path}")
-                return None
+            if legacy_model_path:
+                # Create checkpoint metadata for the legacy model using actual file data
+                legacy_metadata = self._create_legacy_metadata(model_name, legacy_model_path)
+                logger.debug(f"Found legacy model for {model_name}: {legacy_model_path}")
+                return str(legacy_model_path), legacy_metadata
-            logger.debug(f"Loading best checkpoint for {model_name}: {best_checkpoint.checkpoint_id}")
-            return best_checkpoint.file_path, best_checkpoint
+            logger.warning(f"No checkpoints or legacy models found for: {model_name}")
+            return None
         except Exception as e:
             logger.error(f"Error loading best checkpoint for {model_name}: {e}")
@@ -181,16 +199,8 @@ class CheckpointManager:
         # Bonus for processing more training samples
         score += min(10, metrics['training_samples'] / 10)
 
-        # Ensure minimum score for any training activity
-        if score == 0.0 and metrics:
-            # Use the first available metric with better scaling
-            first_metric = next(iter(metrics.values()))
-            if first_metric > 0:
-                score = max(0.1, min(10, first_metric))
-            else:
-                score = 0.1
-        return max(score, 0.1)
+        # Return actual calculated score - NO SYNTHETIC MINIMUM
+        return score
 
     def _should_save_checkpoint(self, model_name: str, performance_score: float) -> bool:
         """Improved checkpoint saving logic with more frequent saves during training"""
@@ -331,6 +341,110 @@ class CheckpointManager:
             stats['total_size_mb'] += model_size
 
         return stats
 
+    def _find_legacy_model(self, model_name: str) -> Optional[Path]:
+        """Find legacy saved models based on model name patterns"""
+        base_dir = Path(self.base_dir)
+
+        # Define model name mappings and patterns for legacy files
+        legacy_patterns = {
+            'dqn_agent': [
+                'dqn_agent_best_policy.pt',
+                'enhanced_dqn_best_policy.pt',
+                'improved_dqn_agent_best_policy.pt',
+                'dqn_agent_final_policy.pt'
+            ],
+            'enhanced_cnn': [
+                'cnn_model_best.pt',
+                'optimized_short_term_model_best.pt',
+                'optimized_short_term_model_realtime_best.pt',
+                'optimized_short_term_model_ticks_best.pt'
+            ],
+            'extrema_trainer': [
+                'supervised_model_best.pt'
+            ],
+            'cob_rl': [
+                'best_rl_model.pth_policy.pt',
+                'rl_agent_best_policy.pt'
+            ],
+            'decision': [
+                # Decision models might be in subdirectories, but check the main dir too
+                'decision_best.pt',
+                'decision_model_best.pt',
+                # Check for transformer models which might be used as decision models
+                'enhanced_dqn_best_policy.pt',
+                'improved_dqn_agent_best_policy.pt'
+            ]
+        }
+
+        # Get patterns for this model name
+        patterns = legacy_patterns.get(model_name, [])
+
+        # Also try generic patterns based on the model name
+        patterns.extend([
+            f'{model_name}_best.pt',
+            f'{model_name}_best_policy.pt',
+            f'{model_name}_final.pt',
+            f'{model_name}_final_policy.pt'
+        ])
+
+        # Search for the model files in the base directory
+        for pattern in patterns:
+            candidate_path = base_dir / pattern
+            if candidate_path.exists():
+                logger.debug(f"Found legacy model file: {candidate_path}")
+                return candidate_path
+
+        # Also check subdirectories named after the model
+        for subdir in base_dir.iterdir():
+            if subdir.is_dir() and subdir.name == model_name:
+                for pattern in patterns:
+                    candidate_path = subdir / pattern
+                    if candidate_path.exists():
+                        logger.debug(f"Found legacy model file in subdirectory: {candidate_path}")
+                        return candidate_path
+
+        return None
+
+    def _create_legacy_metadata(self, model_name: str, file_path: Path) -> CheckpointMetadata:
+        """Create metadata for legacy model files using only actual file information"""
+        try:
+            file_size_mb = file_path.stat().st_size / (1024 * 1024)
+            created_time = datetime.fromtimestamp(file_path.stat().st_mtime)
+
+            # NO SYNTHETIC DATA - use only actual file information
+            return CheckpointMetadata(
+                checkpoint_id=f"legacy_{model_name}_{int(created_time.timestamp())}",
+                model_name=model_name,
+                model_type=model_name,
+                file_path=str(file_path),
+                created_at=created_time,
+                file_size_mb=file_size_mb,
+                performance_score=0.0,  # Unknown performance - use 0, not synthetic values
+                accuracy=None,
+                loss=None,
+                val_accuracy=None,
+                val_loss=None,
+                reward=None,
+                pnl=None,
+                epoch=None,
+                training_time_hours=None,
+                total_parameters=None,
+                wandb_run_id=None,
+                wandb_artifact_name=None
+            )
+        except Exception as e:
+            logger.error(f"Error creating legacy metadata for {model_name}: {e}")
+            # Return basic metadata with minimal info - NO SYNTHETIC VALUES
+            return CheckpointMetadata(
+                checkpoint_id=f"legacy_{model_name}",
+                model_name=model_name,
+                model_type=model_name,
+                file_path=str(file_path),
+                created_at=datetime.now(),
+                file_size_mb=0.0,
+                performance_score=0.0  # Unknown - use 0, not synthetic
+            )
 
 _checkpoint_manager = None
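
The trailing _checkpoint_manager = None is a module-level singleton slot. The accessor itself falls outside this diff; a plausible sketch, assuming a lazily constructed default instance, would look like this:

    # Assumed accessor - the actual name and construction arguments
    # are not shown in this diff, so treat this as illustrative only
    def get_checkpoint_manager() -> CheckpointManager:
        global _checkpoint_manager
        if _checkpoint_manager is None:
            _checkpoint_manager = CheckpointManager()
        return _checkpoint_manager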