This commit is contained in:
Dobromir Popov
2025-11-11 12:21:16 +02:00
parent 6a951e4d7e
commit a6bd5f64ff
2 changed files with 63 additions and 30 deletions

View File

@@ -193,6 +193,7 @@ class AnnotationDashboard:
def _get_best_checkpoint_info(self, model_name: str) -> Optional[Dict]: def _get_best_checkpoint_info(self, model_name: str) -> Optional[Dict]:
""" """
Get best checkpoint info for a model without loading it Get best checkpoint info for a model without loading it
Uses filename parsing instead of torch.load to avoid crashes
Args: Args:
model_name: Name of the model model_name: Name of the model
@@ -201,8 +202,8 @@ class AnnotationDashboard:
Dict with checkpoint info or None if no checkpoint found Dict with checkpoint info or None if no checkpoint found
""" """
try: try:
import torch
import glob import glob
import re
# Map model names to checkpoint directories # Map model names to checkpoint directories
checkpoint_dirs = { checkpoint_dirs = {
@@ -212,37 +213,51 @@ class AnnotationDashboard:
} }
checkpoint_dir = checkpoint_dirs.get(model_name) checkpoint_dir = checkpoint_dirs.get(model_name)
if not checkpoint_dir or not os.path.exists(checkpoint_dir): if not checkpoint_dir:
return None
if not os.path.exists(checkpoint_dir):
logger.debug(f"Checkpoint directory not found: {checkpoint_dir}")
return None return None
# Find all checkpoint files # Find all checkpoint files
checkpoint_files = glob.glob(os.path.join(checkpoint_dir, '*.pt')) checkpoint_files = glob.glob(os.path.join(checkpoint_dir, '*.pt'))
if not checkpoint_files: if not checkpoint_files:
logger.debug(f"No checkpoint files found in {checkpoint_dir}")
return None return None
# Load metadata from each checkpoint and find best logger.debug(f"Found {len(checkpoint_files)} checkpoints for {model_name}")
# Parse filenames to extract epoch info
# Format: transformer_epoch5_20251110_123620.pt
best_checkpoint = None best_checkpoint = None
best_accuracy = -1 best_epoch = -1
for cp_file in checkpoint_files: for cp_file in checkpoint_files:
try: try:
# Load only metadata (map_location='cpu' to avoid GPU) filename = os.path.basename(cp_file)
checkpoint = torch.load(cp_file, map_location='cpu')
accuracy = checkpoint.get('accuracy', 0.0) # Extract epoch number from filename
if accuracy > best_accuracy: match = re.search(r'epoch(\d+)', filename, re.IGNORECASE)
best_accuracy = accuracy if match:
epoch = int(match.group(1))
if epoch > best_epoch:
best_epoch = epoch
best_checkpoint = { best_checkpoint = {
'filename': os.path.basename(cp_file), 'filename': filename,
'epoch': checkpoint.get('epoch', 0), 'epoch': epoch,
'loss': checkpoint.get('loss', 0.0), 'loss': None, # Can't get without loading
'accuracy': accuracy, 'accuracy': None, # Can't get without loading
'learning_rate': checkpoint.get('learning_rate', 0.0) 'source': 'filename'
} }
logger.debug(f"Found checkpoint: {filename}, epoch {epoch}")
except Exception as e: except Exception as e:
logger.debug(f"Could not load checkpoint {cp_file}: {e}") logger.debug(f"Could not parse checkpoint {cp_file}: {e}")
continue continue
if best_checkpoint:
logger.info(f"Best checkpoint for {model_name}: {best_checkpoint['filename']} (E{best_checkpoint['epoch']})")
return best_checkpoint return best_checkpoint
except Exception as e: except Exception as e:
@@ -1305,15 +1320,16 @@ class AnnotationDashboard:
'source': 'loaded' 'source': 'loaded'
} }
# If not loaded, try to read best checkpoint from disk # If not loaded, try to read best checkpoint from disk (filename parsing only)
if not checkpoint_info: if not checkpoint_info:
try: try:
checkpoint_info = self._get_best_checkpoint_info(model_name) cp_info = self._get_best_checkpoint_info(model_name)
if checkpoint_info: if cp_info:
checkpoint_info = cp_info
checkpoint_info['source'] = 'disk' checkpoint_info['source'] = 'disk'
except Exception as e: except Exception as e:
logger.error(f"Error reading checkpoint for {model_name}: {e}") logger.warning(f"Could not read checkpoint for {model_name}: {e}")
# Continue without checkpoint info # Continue without checkpoint info - not critical
model_states.append({ model_states.append({
'name': model_name, 'name': model_name,

View File

@@ -136,20 +136,37 @@
const isLoaded = (model && typeof model === 'object' && 'loaded' in model) ? model.loaded : false; const isLoaded = (model && typeof model === 'object' && 'loaded' in model) ? model.loaded : false;
const checkpoint = (model && typeof model === 'object' && model.checkpoint) ? model.checkpoint : null; const checkpoint = (model && typeof model === 'object' && model.checkpoint) ? model.checkpoint : null;
console.log(` → Name: "${modelName}", Loaded: ${isLoaded}`, checkpoint ? `Checkpoint: epoch ${checkpoint.epoch}, loss ${checkpoint.loss.toFixed(4)}` : ''); console.log(` → Name: "${modelName}", Loaded: ${isLoaded}`, checkpoint ? `Checkpoint: epoch ${checkpoint.epoch}` : '');
const option = document.createElement('option'); const option = document.createElement('option');
option.value = modelName; option.value = modelName;
// Build option text with checkpoint info // Build option text with checkpoint info (simplified for safety)
let optionText = modelName; let optionText = modelName;
try {
if (isLoaded) { if (isLoaded) {
optionText += ' ✓'; optionText += ' ✓';
if (checkpoint) { if (checkpoint && checkpoint.epoch) {
// Show full metrics if available (from loaded model)
if (checkpoint.loss != null && checkpoint.accuracy != null) {
optionText += ` (E${checkpoint.epoch}, L:${checkpoint.loss.toFixed(3)}, A:${(checkpoint.accuracy * 100).toFixed(1)}%)`; optionText += ` (E${checkpoint.epoch}, L:${checkpoint.loss.toFixed(3)}, A:${(checkpoint.accuracy * 100).toFixed(1)}%)`;
} else {
// Show just epoch if metrics not available (from filename)
optionText += ` (E${checkpoint.epoch})`;
}
} }
} else { } else {
optionText += ' (not loaded)'; optionText += ' (not loaded)';
// Optionally show checkpoint exists
if (checkpoint && checkpoint.epoch) {
optionText += ` [E${checkpoint.epoch}]`;
}
}
} catch (e) {
console.error('Error building option text:', e);
// Fallback to simple text
optionText = modelName + (isLoaded ? ' ✓' : ' (not loaded)');
} }
option.textContent = optionText; option.textContent = optionText;