BIG CLEANUP

Dobromir Popov committed 2025-08-08 14:58:55 +03:00
parent e39e9ee95a
commit 2b0d2679c6
162 changed files with 455 additions and 42814 deletions


@@ -1,3 +0,0 @@
"""
Utils package for the multi-modal trading system
"""


@@ -1,232 +0,0 @@
"""
Async Task Manager - Handles async tasks with comprehensive error handling
Prevents silent failures in async operations
"""
import asyncio
import logging
import functools
import traceback
from typing import Any, Callable, Optional, Dict, List
from datetime import datetime
logger = logging.getLogger(__name__)
class AsyncTaskManager:
"""Manage async tasks with error handling and monitoring"""
def __init__(self):
self.active_tasks: Dict[str, asyncio.Task] = {}
self.completed_tasks: List[Dict[str, Any]] = []
self.failed_tasks: List[Dict[str, Any]] = []
self.max_history = 100
def create_task_with_error_handling(self,
coro: Any,
name: str,
error_callback: Optional[Callable] = None,
success_callback: Optional[Callable] = None) -> asyncio.Task:
"""
Create an async task with comprehensive error handling
Args:
coro: Coroutine to run
name: Task name for identification
error_callback: Called on error with (name, exception)
success_callback: Called on success with (name, result)
"""
async def wrapped_coro():
"""Wrapper coroutine with error handling"""
start_time = datetime.now()
try:
logger.debug(f"Starting async task: {name}")
result = await coro
# Log success
duration = (datetime.now() - start_time).total_seconds()
logger.debug(f"Async task '{name}' completed successfully in {duration:.2f}s")
# Store completion info
completion_info = {
'name': name,
'status': 'completed',
'start_time': start_time,
'end_time': datetime.now(),
'duration': duration,
'result': str(result)[:200] if result is not None else None  # Truncate long results
}
self.completed_tasks.append(completion_info)
# Trim history
if len(self.completed_tasks) > self.max_history:
self.completed_tasks.pop(0)
# Call success callback
if success_callback:
try:
success_callback(name, result)
except Exception as cb_error:
logger.error(f"Error in success callback for task '{name}': {cb_error}")
return result
except asyncio.CancelledError:
logger.info(f"Async task '{name}' was cancelled")
raise
except Exception as e:
# Log error with full traceback
duration = (datetime.now() - start_time).total_seconds()
error_msg = f"Async task '{name}' failed after {duration:.2f}s: {e}"
logger.error(error_msg)
logger.error(f"Task '{name}' traceback: {traceback.format_exc()}")
# Store failure info
failure_info = {
'name': name,
'status': 'failed',
'start_time': start_time,
'end_time': datetime.now(),
'duration': duration,
'error': str(e),
'error_type': type(e).__name__,
'traceback': traceback.format_exc()
}
self.failed_tasks.append(failure_info)
# Trim history
if len(self.failed_tasks) > self.max_history:
self.failed_tasks.pop(0)
# Call error callback
if error_callback:
try:
error_callback(name, e)
except Exception as cb_error:
logger.error(f"Error in error callback for task '{name}': {cb_error}")
# Swallow the exception (it is already logged above) and return None so
# callers awaiting the task see a failure marker instead of a re-raised error
return None
finally:
# Remove from active tasks
if name in self.active_tasks:
del self.active_tasks[name]
# Create and store task
task = asyncio.create_task(wrapped_coro(), name=name)
self.active_tasks[name] = task
return task
def cancel_task(self, name: str) -> bool:
"""Cancel a specific task"""
if name in self.active_tasks:
task = self.active_tasks[name]
if not task.done():
task.cancel()
logger.info(f"Cancelled async task: {name}")
return True
return False
def cancel_all_tasks(self):
"""Cancel all active tasks"""
for name, task in list(self.active_tasks.items()):
if not task.done():
task.cancel()
logger.info(f"Cancelled async task: {name}")
def get_task_status(self) -> Dict[str, Any]:
"""Get status of all tasks"""
active_count = len(self.active_tasks)
completed_count = len(self.completed_tasks)
failed_count = len(self.failed_tasks)
# Get recent failures
recent_failures = self.failed_tasks[-5:] if self.failed_tasks else []
return {
'active_tasks': active_count,
'completed_tasks': completed_count,
'failed_tasks': failed_count,
'active_task_names': list(self.active_tasks.keys()),
'recent_failures': [
{
'name': f['name'],
'error': f['error'],
'duration': f['duration'],
'time': f['end_time'].strftime('%H:%M:%S')
}
for f in recent_failures
]
}
def get_failure_summary(self) -> Dict[str, Any]:
"""Get summary of task failures"""
if not self.failed_tasks:
return {'total_failures': 0, 'failure_patterns': {}}
# Count failures by error type
error_counts = {}
for failure in self.failed_tasks:
error_type = failure.get('error_type', 'Unknown')  # 'error' stores str(e), so use the recorded class name
error_counts[error_type] = error_counts.get(error_type, 0) + 1
# Recent failure rate
recent_failures = [f for f in self.failed_tasks if
(datetime.now() - f['end_time']).total_seconds() < 3600] # Last hour
return {
'total_failures': len(self.failed_tasks),
'recent_failures_1h': len(recent_failures),
'failure_patterns': error_counts,
'most_common_error': max(error_counts.items(), key=lambda x: x[1])[0] if error_counts else None
}
# Global instance
_task_manager = None
def get_async_task_manager() -> AsyncTaskManager:
"""Get global async task manager instance"""
global _task_manager
if _task_manager is None:
_task_manager = AsyncTaskManager()
return _task_manager
def create_safe_task(coro: Any,
name: str,
error_callback: Optional[Callable] = None,
success_callback: Optional[Callable] = None) -> asyncio.Task:
"""
Create a safe async task with error handling
Args:
coro: Coroutine to run
name: Task name for identification
error_callback: Called on error with (name, exception)
success_callback: Called on success with (name, result)
"""
manager = get_async_task_manager()
return manager.create_task_with_error_handling(coro, name, error_callback, success_callback)
def safe_async_wrapper(name: str,
error_callback: Optional[Callable] = None,
success_callback: Optional[Callable] = None):
"""
Decorator for creating safe async functions
Usage:
@safe_async_wrapper("my_task")
async def my_async_function():
# Your async code here
pass
"""
def decorator(func):
@functools.wraps(func)
async def wrapper(*args, **kwargs):
coro = func(*args, **kwargs)
task = create_safe_task(coro, name, error_callback, success_callback)
return await task
return wrapper
return decorator
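
A minimal usage sketch (not part of the deleted file) showing how create_safe_task could be driven from an asyncio entry point; the module path utils.async_task_manager is an assumption, since the diff does not show file names.

import asyncio
from utils.async_task_manager import create_safe_task, get_async_task_manager  # assumed path

async def fetch_prices():
    await asyncio.sleep(0.1)  # stand-in for real I/O
    return {"ETH/USDT": 3500.0}

def on_error(name, exc):
    print(f"task {name} failed: {exc}")

async def main():
    task = create_safe_task(fetch_prices(), "fetch_prices", error_callback=on_error)
    result = await task  # None if the wrapped coroutine raised
    print(result)
    print(get_async_task_manager().get_task_status())

asyncio.run(main())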


@@ -1,164 +0,0 @@
#!/usr/bin/env python3
"""
TensorBoard Launcher with Automatic Port Management
This script launches TensorBoard with automatic port fallback if the preferred port is in use.
It also kills any stale debug instances that might be running.
Usage:
python launch_tensorboard.py --logdir=path/to/logs --preferred-port=6007 --port-range=6000-7000
"""
import os
import sys
import subprocess
import argparse
import logging
from pathlib import Path
# Add project root to path
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if project_root not in sys.path:
sys.path.append(project_root)
from utils.port_manager import get_port_with_fallback, kill_stale_debug_instances
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('tensorboard_launcher')
def launch_tensorboard(logdir, port, host='localhost', open_browser=True):
"""
Launch TensorBoard on the specified port
Args:
logdir (str): Path to log directory
port (int): Port to use
host (str): Host to bind to
open_browser (bool): Whether to open browser automatically
Returns:
subprocess.Popen: Process object
"""
cmd = [
sys.executable, "-m", "tensorboard.main",
f"--logdir={logdir}",
f"--port={port}",
f"--host={host}"
]
# Pass --load_fast=false to disable the fast data loader for broader compatibility
cmd.append("--load_fast=false")
# The open_browser flag has no direct TensorBoard equivalent here; it only sets a distinguishing window title
if not open_browser:
cmd.append("--window_title=TensorBoard")
logger.info(f"Launching TensorBoard: {' '.join(cmd)}")
# Use subprocess.Popen to start TensorBoard without waiting for it to finish
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1
)
# Log the first few lines of output to confirm it's starting correctly
line_count = 0
for line in process.stdout:
logger.info(f"TensorBoard: {line.strip()}")
line_count += 1
# Check if TensorBoard has started successfully
if "TensorBoard" in line and "http://" in line:
url = line.strip().split("http://")[1].split(" ")[0]
logger.info(f"TensorBoard available at: http://{url}")
# Only log the first few lines
if line_count >= 10:
break
# Continue reading output in background to prevent pipe from filling
def read_output():
for line in process.stdout:
pass
import threading
threading.Thread(target=read_output, daemon=True).start()
return process
def main():
parser = argparse.ArgumentParser(description='Launch TensorBoard with automatic port management')
parser.add_argument('--logdir', type=str, default='NN/models/saved/logs',
help='Directory containing TensorBoard event files')
parser.add_argument('--preferred-port', type=int, default=6007,
help='Preferred port to use')
parser.add_argument('--port-range', type=str, default='6000-7000',
help='Port range to try if preferred port is unavailable (format: min-max)')
parser.add_argument('--host', type=str, default='localhost',
help='Host to bind to')
parser.add_argument('--no-browser', action='store_true',
help='Do not open browser automatically')
parser.add_argument('--kill-stale', action='store_true',
help='Kill stale debug instances before starting')
args = parser.parse_args()
# Parse port range
try:
min_port, max_port = map(int, args.port_range.split('-'))
except ValueError:
logger.error(f"Invalid port range format: {args.port_range}. Use format: min-max")
return 1
# Kill stale instances if requested
if args.kill_stale:
logger.info("Killing stale debug instances...")
count, _ = kill_stale_debug_instances()
logger.info(f"Killed {count} stale instances")
# Get an available port
try:
port = get_port_with_fallback(args.preferred_port, min_port, max_port)
logger.info(f"Using port {port} for TensorBoard")
except RuntimeError as e:
logger.error(str(e))
return 1
# Ensure log directory exists
logdir = os.path.abspath(args.logdir)
os.makedirs(logdir, exist_ok=True)
# Launch TensorBoard
process = launch_tensorboard(
logdir=logdir,
port=port,
host=args.host,
open_browser=not args.no_browser
)
# Wait for process to end (it shouldn't unless there's an error or user kills it)
try:
return_code = process.wait()
if return_code != 0:
logger.error(f"TensorBoard exited with code {return_code}")
return return_code
except KeyboardInterrupt:
logger.info("Received keyboard interrupt, shutting down TensorBoard...")
process.terminate()
try:
process.wait(timeout=5)
except subprocess.TimeoutExpired:
logger.warning("TensorBoard didn't terminate gracefully, forcing kill")
process.kill()
return 0
if __name__ == "__main__":
sys.exit(main())


@@ -1,241 +0,0 @@
#!/usr/bin/env python
"""
Model utilities for robust saving and loading of PyTorch models
"""
import os
import logging
import torch
import shutil
import gc
import json
from typing import Any, Dict, Optional, Union
logger = logging.getLogger(__name__)
def robust_save(model: Any, path: str, include_optimizer: bool = True) -> bool:
"""
Robust model saving with multiple fallback approaches
Args:
model: The model object to save (should have policy_net, target_net, optimizer, epsilon attributes)
path: Path to save the model
include_optimizer: Whether to include optimizer state in the save
Returns:
bool: True if successful, False otherwise
"""
# Create directory if it doesn't exist
os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
# Backup path in case the main save fails
backup_path = f"{path}.backup"
# Clean up GPU memory before saving
if torch.cuda.is_available():
torch.cuda.empty_cache()
gc.collect()
# Prepare checkpoint data
checkpoint = {
'policy_net': model.policy_net.state_dict(),
'target_net': model.target_net.state_dict(),
'epsilon': getattr(model, 'epsilon', 0.0),
'state_size': getattr(model, 'state_size', None),
'action_size': getattr(model, 'action_size', None),
'hidden_size': getattr(model, 'hidden_size', None),
}
# Add optimizer state if requested and available
if include_optimizer and hasattr(model, 'optimizer') and model.optimizer is not None:
checkpoint['optimizer'] = model.optimizer.state_dict()
# Attempt 1: Try with default settings in a separate file first
try:
logger.info(f"Saving model to {backup_path} (attempt 1)")
torch.save(checkpoint, backup_path)
logger.info(f"Successfully saved to {backup_path}")
# If backup worked, copy to the actual path
if os.path.exists(backup_path):
shutil.copy(backup_path, path)
logger.info(f"Copied backup to {path}")
return True
except Exception as e:
logger.warning(f"First save attempt failed: {e}")
# Attempt 2: Try with pickle protocol 2 (more compatible)
try:
logger.info(f"Saving model to {path} (attempt 2 - pickle protocol 2)")
torch.save(checkpoint, path, pickle_protocol=2)
logger.info(f"Successfully saved to {path} with pickle_protocol=2")
return True
except Exception as e:
logger.warning(f"Second save attempt failed: {e}")
# Attempt 3: Try without optimizer state (which can be large and cause issues)
try:
logger.info(f"Saving model to {path} (attempt 3 - without optimizer)")
checkpoint_no_opt = {k: v for k, v in checkpoint.items() if k != 'optimizer'}
torch.save(checkpoint_no_opt, path)
logger.info(f"Successfully saved to {path} without optimizer state")
return True
except Exception as e:
logger.warning(f"Third save attempt failed: {e}")
# Attempt 4: Try with torch.jit.save instead
try:
logger.info(f"Saving model to {path} (attempt 4 - with jit.save)")
# Save policy network using jit
scripted_policy = torch.jit.script(model.policy_net)
torch.jit.save(scripted_policy, f"{path}.policy.jit")
# Save target network using jit
scripted_target = torch.jit.script(model.target_net)
torch.jit.save(scripted_target, f"{path}.target.jit")
# Save parameters separately as JSON
params = {
'epsilon': float(getattr(model, 'epsilon', 0.0)),
'state_size': int(getattr(model, 'state_size', 0)),
'action_size': int(getattr(model, 'action_size', 0)),
'hidden_size': int(getattr(model, 'hidden_size', 0))
}
with open(f"{path}.params.json", "w") as f:
json.dump(params, f)
logger.info(f"Successfully saved model components with jit.save")
return True
except Exception as e:
logger.error(f"All save attempts failed: {e}")
return False
def robust_load(model: Any, path: str, device: Optional[torch.device] = None) -> bool:
"""
Robust model loading with fallback approaches
Args:
model: The model object to load into
path: Path to load the model from
device: Device to load the model on
Returns:
bool: True if successful, False otherwise
"""
if device is None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Try regular PyTorch load first
try:
logger.info(f"Loading model from {path}")
if os.path.exists(path):
checkpoint = torch.load(path, map_location=device)
# Load network states
if 'policy_net' in checkpoint:
model.policy_net.load_state_dict(checkpoint['policy_net'])
if 'target_net' in checkpoint:
model.target_net.load_state_dict(checkpoint['target_net'])
# Load other attributes
if 'epsilon' in checkpoint:
model.epsilon = checkpoint['epsilon']
if 'optimizer' in checkpoint and hasattr(model, 'optimizer') and model.optimizer is not None:
try:
model.optimizer.load_state_dict(checkpoint['optimizer'])
except Exception as e:
logger.warning(f"Failed to load optimizer state: {e}")
logger.info("Successfully loaded model")
return True
except Exception as e:
logger.warning(f"Regular load failed: {e}")
# Try loading JIT saved components
try:
policy_path = f"{path}.policy.jit"
target_path = f"{path}.target.jit"
params_path = f"{path}.params.json"
if all(os.path.exists(p) for p in [policy_path, target_path, params_path]):
logger.info(f"Loading JIT model components")
# Load JIT models (this is more complex and may need model reconstruction)
# For now, just log that we found JIT files
logger.info("Found JIT model files, but loading them requires special handling")
with open(params_path, 'r') as f:
params = json.load(f)
logger.info(f"Model parameters: {params}")
# Note: Actually loading JIT models would require recreating the model architecture
# This is a placeholder for future implementation
return False
except Exception as e:
logger.error(f"JIT load failed: {e}")
logger.error(f"All load attempts failed for {path}")
return False
def get_model_info(path: str) -> Dict[str, Any]:
"""
Get information about a saved model
Args:
path: Path to the model file
Returns:
dict: Model information
"""
info = {
'exists': False,
'size_bytes': 0,
'has_optimizer': False,
'parameters': {}
}
try:
if os.path.exists(path):
info['exists'] = True
info['size_bytes'] = os.path.getsize(path)
# Try to load and inspect
checkpoint = torch.load(path, map_location='cpu')
info['has_optimizer'] = 'optimizer' in checkpoint
# Extract parameter info
for key in ['epsilon', 'state_size', 'action_size', 'hidden_size']:
if key in checkpoint:
info['parameters'][key] = checkpoint[key]
except Exception as e:
logger.warning(f"Failed to get model info for {path}: {e}")
return info
def verify_save_load_cycle(model: Any, test_path: str) -> bool:
"""
Test that a model can be saved and loaded correctly
Args:
model: Model to test
test_path: Path for test file
Returns:
bool: True if save/load cycle successful
"""
try:
# Save the model
if not robust_save(model, test_path):
return False
# Create a new model instance (this would need model creation logic)
# For now, just verify the file exists and has content
if os.path.exists(test_path) and os.path.getsize(test_path) > 0:
logger.info("Save/load cycle verification successful")
# Clean up test file
os.remove(test_path)
return True
else:
return False
except Exception as e:
logger.error(f"Save/load cycle verification failed: {e}")
return False
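
A hedged usage sketch (not from the original file): robust_save expects an object exposing policy_net, target_net, optimizer and epsilon, so a tiny stand-in agent is enough to exercise the helpers; the utils.model_utils import path is assumed.

import torch
import torch.nn as nn
from utils.model_utils import robust_save, robust_load, get_model_info  # assumed path

class TinyAgent:
    """Stand-in exposing the attributes the save/load helpers expect."""
    def __init__(self, state_size=8, action_size=3, hidden_size=16):
        self.state_size, self.action_size, self.hidden_size = state_size, action_size, hidden_size
        self.policy_net = nn.Linear(state_size, action_size)
        self.target_net = nn.Linear(state_size, action_size)
        self.optimizer = torch.optim.Adam(self.policy_net.parameters())
        self.epsilon = 0.1

agent = TinyAgent()
if robust_save(agent, "checkpoints/tiny_agent.pt"):
    print(get_model_info("checkpoints/tiny_agent.pt"))
    robust_load(TinyAgent(), "checkpoints/tiny_agent.pt")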


@@ -1,238 +0,0 @@
#!/usr/bin/env python3
"""
Port Management Utility
This script provides utilities to:
1. Find available ports in a specified range
2. Kill stale processes running on specific ports
3. Kill all debug/training instances
Usage:
- As a module: import port_manager and use its functions
- Directly: python port_manager.py --kill-stale --min-port 6000 --max-port 7000
"""
import os
import sys
import socket
import argparse
import psutil
import logging
import time
import signal
from typing import List, Tuple, Optional, Set
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('port_manager')
# Define process names to look for when killing stale instances
DEBUG_PROCESS_KEYWORDS = [
'tensorboard',
'python train_',
'realtime.py',
'train_rl_with_realtime.py'
]
def is_port_in_use(port: int) -> bool:
"""
Check if a port is in use
Args:
port (int): Port number to check
Returns:
bool: True if port is in use, False otherwise
"""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
return s.connect_ex(('localhost', port)) == 0
def find_available_port(start_port: int, end_port: int) -> Optional[int]:
"""
Find an available port in the specified range
Args:
start_port (int): Lower bound of port range
end_port (int): Upper bound of port range
Returns:
Optional[int]: Available port number or None if no ports available
"""
for port in range(start_port, end_port + 1):
if not is_port_in_use(port):
return port
return None
def get_process_by_port(port: int) -> List[psutil.Process]:
"""
Get processes using a specific port
Args:
port (int): Port number to check
Returns:
List[psutil.Process]: List of processes using the port
"""
processes = []
for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
try:
for conn in proc.connections(kind='inet'):
if conn.laddr.port == port:
processes.append(proc)
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
pass
return processes
def kill_process_by_port(port: int) -> Tuple[int, List[str]]:
"""
Kill processes using a specific port
Args:
port (int): Port number to check
Returns:
Tuple[int, List[str]]: Count of killed processes and their names
"""
processes = get_process_by_port(port)
killed = []
for proc in processes:
try:
proc_name = " ".join(proc.cmdline()) if proc.cmdline() else proc.name()
logger.info(f"Terminating process {proc.pid}: {proc_name}")
proc.terminate()
killed.append(proc_name)
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
# Give processes time to terminate gracefully
if processes:
time.sleep(0.5)
# Force kill any remaining processes
for proc in processes:
try:
if proc.is_running():
logger.info(f"Force killing process {proc.pid}")
proc.kill()
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
return len(killed), killed
def kill_stale_debug_instances() -> Tuple[int, Set[str]]:
"""
Kill all stale debug and training instances based on process names
Returns:
Tuple[int, Set[str]]: Count of killed processes and their names
"""
killed_count = 0
killed_procs = set()
for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
try:
cmd = " ".join(proc.cmdline()) if proc.cmdline() else proc.name()
# Check if this is a debug/training process we should kill
if any(keyword in cmd for keyword in DEBUG_PROCESS_KEYWORDS):
logger.info(f"Terminating stale process {proc.pid}: {cmd}")
proc.terminate()
killed_count += 1
killed_procs.add(cmd)
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
pass
# Give processes time to terminate
if killed_count > 0:
time.sleep(1)
# Force kill any remaining processes
for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
try:
cmd = " ".join(proc.cmdline()) if proc.cmdline() else proc.name()
if any(keyword in cmd for keyword in DEBUG_PROCESS_KEYWORDS) and proc.is_running():
logger.info(f"Force killing stale process {proc.pid}")
proc.kill()
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
pass
return killed_count, killed_procs
def get_port_with_fallback(preferred_port: int, min_port: int, max_port: int) -> int:
"""
Try to use preferred port, fall back to any available port in range
Args:
preferred_port (int): Preferred port to use
min_port (int): Minimum port in fallback range
max_port (int): Maximum port in fallback range
Returns:
int: Available port number
"""
# First try the preferred port
if not is_port_in_use(preferred_port):
return preferred_port
# If preferred port is in use, try to free it
logger.info(f"Preferred port {preferred_port} is in use, attempting to free it")
kill_count, _ = kill_process_by_port(preferred_port)
if kill_count > 0 and not is_port_in_use(preferred_port):
logger.info(f"Successfully freed port {preferred_port}")
return preferred_port
# If we couldn't free the preferred port, find another available port
logger.info(f"Looking for available port in range {min_port}-{max_port}")
available_port = find_available_port(min_port, max_port)
if available_port:
logger.info(f"Using alternative port: {available_port}")
return available_port
else:
# If no ports are available, force kill processes in the entire range
logger.warning(f"No available ports in range {min_port}-{max_port}, freeing ports")
for port in range(min_port, max_port + 1):
kill_process_by_port(port)
# Try again
available_port = find_available_port(min_port, max_port)
if available_port:
logger.info(f"Using port {available_port} after freeing")
return available_port
else:
logger.error(f"Could not find available port even after freeing range {min_port}-{max_port}")
raise RuntimeError(f"No available ports in range {min_port}-{max_port}")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Port management utility')
parser.add_argument('--kill-stale', action='store_true', help='Kill all stale debug instances')
parser.add_argument('--free-port', type=int, help='Free a specific port')
parser.add_argument('--find-port', action='store_true', help='Find an available port')
parser.add_argument('--min-port', type=int, default=6000, help='Minimum port in range')
parser.add_argument('--max-port', type=int, default=7000, help='Maximum port in range')
parser.add_argument('--preferred-port', type=int, help='Preferred port to use')
args = parser.parse_args()
if args.kill_stale:
count, procs = kill_stale_debug_instances()
logger.info(f"Killed {count} stale processes")
for proc in procs:
logger.info(f" - {proc}")
if args.free_port:
count, killed = kill_process_by_port(args.free_port)
logger.info(f"Killed {count} processes using port {args.free_port}")
for proc in killed:
logger.info(f" - {proc}")
if args.find_port or args.preferred_port:
preferred = args.preferred_port if args.preferred_port else args.min_port
port = get_port_with_fallback(preferred, args.min_port, args.max_port)
print(port) # Print only the port number for easy capture in scripts


@@ -1,340 +0,0 @@
"""
Process Supervisor - Handles process monitoring, restarts, and supervision
Prevents silent failures by monitoring process health and restarting on crashes
"""
import subprocess
import threading
import time
import logging
import signal
import os
import sys
from typing import Dict, Any, Optional, Callable, List
from datetime import datetime, timedelta
from pathlib import Path
logger = logging.getLogger(__name__)
class ProcessSupervisor:
"""Supervise processes and restart them on failure"""
def __init__(self, max_restarts: int = 5, restart_delay: int = 10):
"""
Initialize process supervisor
Args:
max_restarts: Maximum number of restarts before giving up
restart_delay: Delay in seconds between restarts
"""
self.max_restarts = max_restarts
self.restart_delay = restart_delay
self.processes: Dict[str, Dict[str, Any]] = {}
self.monitoring = False
self.monitor_thread = None
# Callbacks
self.process_started_callback: Optional[Callable] = None
self.process_failed_callback: Optional[Callable] = None
self.process_restarted_callback: Optional[Callable] = None
def add_process(self, name: str, command: List[str],
working_dir: Optional[str] = None,
env: Optional[Dict[str, str]] = None,
auto_restart: bool = True):
"""
Add a process to supervise
Args:
name: Process name
command: Command to run as list
working_dir: Working directory
env: Environment variables
auto_restart: Whether to auto-restart on failure
"""
self.processes[name] = {
'command': command,
'working_dir': working_dir,
'env': env,
'auto_restart': auto_restart,
'process': None,
'restart_count': 0,
'last_start': None,
'last_failure': None,
'status': 'stopped'
}
logger.info(f"Added process '{name}' to supervisor")
def start_process(self, name: str) -> bool:
"""Start a specific process"""
if name not in self.processes:
logger.error(f"Process '{name}' not found")
return False
proc_info = self.processes[name]
if proc_info['process'] and proc_info['process'].poll() is None:
logger.warning(f"Process '{name}' is already running")
return True
try:
# Prepare environment
env = os.environ.copy()
if proc_info['env']:
env.update(proc_info['env'])
# Start process
process = subprocess.Popen(
proc_info['command'],
cwd=proc_info['working_dir'],
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
proc_info['process'] = process
proc_info['last_start'] = datetime.now()
proc_info['status'] = 'running'
logger.info(f"Started process '{name}' (PID: {process.pid})")
if self.process_started_callback:
try:
self.process_started_callback(name, process.pid)
except Exception as e:
logger.error(f"Error in process started callback: {e}")
return True
except Exception as e:
logger.error(f"Failed to start process '{name}': {e}")
proc_info['status'] = 'failed'
proc_info['last_failure'] = datetime.now()
return False
def stop_process(self, name: str, timeout: int = 10) -> bool:
"""Stop a specific process"""
if name not in self.processes:
logger.error(f"Process '{name}' not found")
return False
proc_info = self.processes[name]
process = proc_info['process']
if not process or process.poll() is not None:
logger.info(f"Process '{name}' is not running")
proc_info['status'] = 'stopped'
return True
try:
# Try graceful shutdown first
process.terminate()
# Wait for graceful shutdown
try:
process.wait(timeout=timeout)
logger.info(f"Process '{name}' terminated gracefully")
except subprocess.TimeoutExpired:
# Force kill if graceful shutdown fails
logger.warning(f"Process '{name}' did not terminate gracefully, force killing")
process.kill()
process.wait()
logger.info(f"Process '{name}' force killed")
proc_info['status'] = 'stopped'
return True
except Exception as e:
logger.error(f"Error stopping process '{name}': {e}")
return False
def restart_process(self, name: str) -> bool:
"""Restart a specific process"""
logger.info(f"Restarting process '{name}'")
if name not in self.processes:
logger.error(f"Process '{name}' not found")
return False
proc_info = self.processes[name]
# Stop if running
if proc_info['process'] and proc_info['process'].poll() is None:
self.stop_process(name)
# Wait restart delay
time.sleep(self.restart_delay)
# Increment restart count
proc_info['restart_count'] += 1
# Check restart limit
if proc_info['restart_count'] > self.max_restarts:
logger.error(f"Process '{name}' exceeded max restarts ({self.max_restarts})")
proc_info['status'] = 'failed_max_restarts'
return False
# Start process
success = self.start_process(name)
if success and self.process_restarted_callback:
try:
self.process_restarted_callback(name, proc_info['restart_count'])
except Exception as e:
logger.error(f"Error in process restarted callback: {e}")
return success
def start_monitoring(self):
"""Start process monitoring"""
if self.monitoring:
logger.warning("Process monitoring already started")
return
self.monitoring = True
self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
self.monitor_thread.start()
logger.info("Process monitoring started")
def stop_monitoring(self):
"""Stop process monitoring"""
self.monitoring = False
if self.monitor_thread:
self.monitor_thread.join(timeout=5)
logger.info("Process monitoring stopped")
def _monitor_loop(self):
"""Main monitoring loop"""
logger.info("Process monitoring loop started")
while self.monitoring:
try:
for name, proc_info in self.processes.items():
self._check_process_health(name, proc_info)
time.sleep(5) # Check every 5 seconds
except Exception as e:
logger.error(f"Error in process monitoring loop: {e}")
time.sleep(5)
logger.info("Process monitoring loop stopped")
def _check_process_health(self, name: str, proc_info: Dict[str, Any]):
"""Check health of a specific process"""
process = proc_info['process']
if not process:
return
# Check if process is still running
return_code = process.poll()
if return_code is not None:
# Process has exited
proc_info['status'] = 'exited'
proc_info['last_failure'] = datetime.now()
logger.warning(f"Process '{name}' exited with code {return_code}")
# Read stdout/stderr for debugging
try:
stdout, stderr = process.communicate(timeout=1)
if stdout:
logger.info(f"Process '{name}' stdout: {stdout[-500:]}") # Last 500 chars
if stderr:
logger.error(f"Process '{name}' stderr: {stderr[-500:]}") # Last 500 chars
except Exception as e:
logger.warning(f"Could not read process output: {e}")
if self.process_failed_callback:
try:
self.process_failed_callback(name, return_code)
except Exception as e:
logger.error(f"Error in process failed callback: {e}")
# Auto-restart if enabled
if proc_info['auto_restart'] and proc_info['restart_count'] < self.max_restarts:
logger.info(f"Auto-restarting process '{name}'")
threading.Thread(target=self.restart_process, args=(name,), daemon=True).start()
def get_process_status(self, name: str) -> Optional[Dict[str, Any]]:
"""Get status of a specific process"""
if name not in self.processes:
return None
proc_info = self.processes[name]
process = proc_info['process']
status = {
'name': name,
'status': proc_info['status'],
'restart_count': proc_info['restart_count'],
'last_start': proc_info['last_start'],
'last_failure': proc_info['last_failure'],
'auto_restart': proc_info['auto_restart'],
'pid': process.pid if process and process.poll() is None else None,
'running': process is not None and process.poll() is None
}
return status
def get_all_status(self) -> Dict[str, Dict[str, Any]]:
"""Get status of all processes"""
return {name: self.get_process_status(name) for name in self.processes}
def set_callbacks(self,
process_started: Optional[Callable] = None,
process_failed: Optional[Callable] = None,
process_restarted: Optional[Callable] = None):
"""Set callback functions for process events"""
self.process_started_callback = process_started
self.process_failed_callback = process_failed
self.process_restarted_callback = process_restarted
def shutdown_all(self):
"""Shutdown all processes"""
logger.info("Shutting down all supervised processes")
for name in list(self.processes.keys()):
self.stop_process(name)
self.stop_monitoring()
# Global instance
_process_supervisor = None
def get_process_supervisor() -> ProcessSupervisor:
"""Get global process supervisor instance"""
global _process_supervisor
if _process_supervisor is None:
_process_supervisor = ProcessSupervisor()
return _process_supervisor
def create_supervised_dashboard_runner():
"""Create a supervised version of the dashboard runner"""
supervisor = get_process_supervisor()
# Add dashboard process
supervisor.add_process(
name="clean_dashboard",
command=[sys.executable, "run_clean_dashboard.py"],
working_dir=os.getcwd(),
auto_restart=True
)
# Set up callbacks
def on_process_failed(name: str, return_code: int):
logger.error(f"Dashboard process failed with code {return_code}")
def on_process_restarted(name: str, restart_count: int):
logger.info(f"Dashboard restarted (attempt {restart_count})")
supervisor.set_callbacks(
process_failed=on_process_failed,
process_restarted=on_process_restarted
)
return supervisor
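
A short sketch (assumed module path utils.process_supervisor) of how the supervised dashboard runner above might be started and shut down:

from utils.process_supervisor import create_supervised_dashboard_runner  # assumed path

supervisor = create_supervised_dashboard_runner()
supervisor.start_process("clean_dashboard")
supervisor.start_monitoring()  # restarts the dashboard if it exits unexpectedly
# ... later, on shutdown:
# supervisor.shutdown_all()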


@@ -1,220 +0,0 @@
"""
Improved Reward Function for RL Trading Agent
This module provides a more sophisticated reward function for the RL trading agent
that incorporates realistic trading fees, penalties for excessive trading, and
rewards for successful holding of positions.
"""
import numpy as np
from datetime import datetime, timedelta
from collections import deque
import logging
logger = logging.getLogger(__name__)
class RewardCalculator:
def __init__(self, base_fee_rate=0.001, reward_scaling=10.0, risk_aversion=0.1):
self.base_fee_rate = base_fee_rate
self.reward_scaling = reward_scaling
self.risk_aversion = risk_aversion
self.trade_pnls = []
self.returns = []
self.trade_timestamps = []
self.frequency_threshold = 10 # Trades per minute threshold for penalty
self.max_frequency_penalty = 0.05
def record_pnl(self, pnl):
"""Record P&L for risk adjustment calculations"""
self.trade_pnls.append(pnl)
if len(self.trade_pnls) > 100:
self.trade_pnls.pop(0)
def record_trade(self, action):
"""Record trade action for frequency penalty calculations"""
from time import time
self.trade_timestamps.append(time())
if len(self.trade_timestamps) > 100:
self.trade_timestamps.pop(0)
def _calculate_frequency_penalty(self):
"""Calculate penalty for high-frequency trading"""
if len(self.trade_timestamps) < 2:
return 0.0
time_span = self.trade_timestamps[-1] - self.trade_timestamps[0]
if time_span <= 0:
return 0.0
trades_per_minute = (len(self.trade_timestamps) / time_span) * 60
if trades_per_minute > self.frequency_threshold:
penalty = min(self.max_frequency_penalty, (trades_per_minute - self.frequency_threshold) * 0.001)
return penalty
return 0.0
def _calculate_risk_adjustment(self, reward):
"""Adjust rewards based on risk (simple Sharpe ratio implementation)"""
if len(self.trade_pnls) < 5:
return reward
pnl_array = np.array(self.trade_pnls)
mean_return = np.mean(pnl_array)
std_return = np.std(pnl_array)
if std_return == 0:
return reward
sharpe = mean_return / std_return
adjustment_factor = np.clip(1.0 + 0.5 * sharpe, 0.5, 2.0)
return reward * adjustment_factor
def _calculate_holding_reward(self, position_held_time, price_change):
"""Calculate reward for holding a position"""
base_holding_reward = 0.0005 * (position_held_time / 60.0)
if price_change > 0:
return base_holding_reward * 2
elif price_change < 0:
return base_holding_reward * 0.5
return base_holding_reward
def calculate_basic_reward(self, pnl, confidence):
"""Calculate basic training reward based on P&L and confidence"""
try:
base_reward = pnl
if pnl < 0 and confidence > 0.7:
confidence_adjustment = -confidence * 2
elif pnl > 0 and confidence > 0.7:
confidence_adjustment = confidence * 1.5
else:
confidence_adjustment = 0
final_reward = base_reward + confidence_adjustment
normalized_reward = np.tanh(final_reward / 10.0)
logger.debug(f"Basic reward calculation: P&L={pnl:.4f}, confidence={confidence:.2f}, reward={normalized_reward:.4f}")
return float(normalized_reward)
except Exception as e:
logger.error(f"Error calculating basic reward: {e}")
return 0.0
def calculate_enhanced_reward(self, action, price_change, position_held_time=0, volatility=None, is_profitable=False, confidence=0.0, predicted_change=0.0, actual_change=0.0, current_pnl=0.0, symbol='UNKNOWN'):
"""Calculate enhanced reward for trading actions with shifted neutral point
Neutral reward is shifted to require profits that exceed double the fees,
which penalizes small profit trades and encourages holding for larger moves.
Current PnL is given more weight in the decision-making process.
"""
fee = self.base_fee_rate
double_fee = fee * 4 # double the round-trip cost: (open fee + close fee) * 2 = 4x base fee
frequency_penalty = self._calculate_frequency_penalty()
if action == 0: # Buy
# Buying always pays the fee plus any frequency penalty
reward = -fee - frequency_penalty
if current_pnl > 0:
# Extra penalty for adding exposure while already in profit
reward -= current_pnl * 0.2
elif action == 1: # Sell
profit_pct = price_change
# Shift neutral point - require profit > double fees to be considered positive
net_profit = profit_pct - double_fee
# Scale reward based on profit size
if net_profit > 0:
# Exponential reward for larger profits
reward = (net_profit ** 1.5) * self.reward_scaling
else:
# Linear penalty for losses
reward = net_profit * self.reward_scaling
reward -= frequency_penalty
self.record_pnl(net_profit)
# Add extra penalty for very small profits (less than 3x the round-trip fee, i.e. 6x base fee)
if 0 < profit_pct < (fee * 6):
reward -= 0.5 # Discourage tiny profit-taking
else: # Hold
if is_profitable:
# Increase reward for holding profitable positions
profit_factor = min(5.0, current_pnl * 20) # Cap at 5x
reward = self._calculate_holding_reward(position_held_time, price_change) * (1.0 + profit_factor)
# Add bonus for holding through volatility when profitable
if volatility is not None and volatility > 0.001:
reward += 0.1 * volatility * 100
else:
# Small penalty for holding losing positions
loss_factor = min(1.0, abs(current_pnl) * 10)
reward = -0.0001 * (1.0 + loss_factor)
# But reduce penalty for very recent positions (give them time)
if position_held_time < 30: # Less than 30 seconds
reward *= 0.5
# Prediction accuracy reward component
if action in [0, 1] and predicted_change != 0:
if (action == 0 and actual_change > 0) or (action == 1 and actual_change < 0):
reward += abs(actual_change) * 5.0
else:
reward -= abs(predicted_change) * 2.0
# Increase weight of current PnL in decision making (3x more than before)
reward += current_pnl * 0.3
# Volatility penalty
if volatility is not None:
reward -= abs(volatility) * 100
# Risk adjustment
if self.risk_aversion > 0 and len(self.returns) > 1:
returns_std = np.std(self.returns)
reward -= returns_std * self.risk_aversion
self.record_trade(action)
return reward
def calculate_prediction_reward(self, symbol, predicted_direction, actual_direction, confidence, predicted_change, actual_change, current_pnl=0.0, position_duration=0.0):
"""Calculate reward for prediction accuracy"""
reward = 0.0
if predicted_direction == actual_direction:
reward += 1.0 * confidence
else:
reward -= 0.5
if predicted_direction == actual_direction and abs(predicted_change) > 0.001:
reward += abs(actual_change) * 5.0
if predicted_direction != actual_direction and abs(predicted_change) > 0.001:
reward -= abs(predicted_change) * 2.0
reward += current_pnl * 0.1
# Dynamic adjustment based on recent PnL (loss cutting incentive)
if hasattr(self, 'pnl_history') and symbol in self.pnl_history and self.pnl_history[symbol]:
latest_pnl_entry = self.pnl_history[symbol][-1]
latest_pnl_value = latest_pnl_entry.get('pnl', 0.0) if isinstance(latest_pnl_entry, dict) else 0.0
if latest_pnl_value < 0 and position_duration > 60:
reward -= (abs(latest_pnl_value) * 0.2)
pnl_values = [entry.get('pnl', 0.0) for entry in self.pnl_history[symbol] if isinstance(entry, dict)]
best_pnl = max(pnl_values) if pnl_values else 0.0
if best_pnl < 0.0:
reward -= 0.1
return reward
# Example usage:
if __name__ == "__main__":
# Create calculator instance
reward_calc = RewardCalculator()
# Example reward for a buy action
buy_reward = reward_calc.calculate_enhanced_reward(action=0, price_change=0)
print(f"Buy action reward: {buy_reward:.5f}")
# Record a trade for frequency tracking
reward_calc.record_trade(0)
# Wait a bit and make another trade to test frequency penalty
import time
time.sleep(0.1)
# Example reward for a sell action with profit
sell_reward = reward_calc.calculate_enhanced_reward(action=1, price_change=0.015, position_held_time=60)
print(f"Sell action reward (with profit): {sell_reward:.5f}")
# Example reward for a hold action on profitable position
hold_reward = reward_calc.calculate_enhanced_reward(action=2, price_change=0.01, position_held_time=30, is_profitable=True)
print(f"Hold action reward (profitable): {hold_reward:.5f}")
# Example reward for a hold action on unprofitable position
hold_reward_neg = reward_calc.calculate_enhanced_reward(action=2, price_change=-0.01, position_held_time=30, is_profitable=False)
print(f"Hold action reward (unprofitable): {hold_reward_neg:.5f}")


@@ -1,288 +0,0 @@
"""
System Resource Monitor - Prevents resource exhaustion and silent failures
Monitors memory, CPU, and disk usage to prevent system crashes
"""
import psutil
import logging
import threading
import time
import gc
import os
from typing import Dict, Any, Optional, Callable
from datetime import datetime, timedelta
logger = logging.getLogger(__name__)
class SystemResourceMonitor:
"""Monitor system resources and prevent exhaustion"""
def __init__(self,
memory_threshold_mb: int = 7000, # 7GB threshold for 8GB system
cpu_threshold_percent: float = 90.0,
disk_threshold_percent: float = 95.0,
check_interval_seconds: int = 30):
"""
Initialize system resource monitor
Args:
memory_threshold_mb: Memory threshold in MB before cleanup
cpu_threshold_percent: CPU threshold percentage before warning
disk_threshold_percent: Disk usage threshold before warning
check_interval_seconds: How often to check resources
"""
self.memory_threshold_mb = memory_threshold_mb
self.cpu_threshold_percent = cpu_threshold_percent
self.disk_threshold_percent = disk_threshold_percent
self.check_interval = check_interval_seconds
self.monitoring = False
self.monitor_thread = None
# Callbacks for resource events
self.memory_warning_callback: Optional[Callable] = None
self.cpu_warning_callback: Optional[Callable] = None
self.disk_warning_callback: Optional[Callable] = None
self.cleanup_callback: Optional[Callable] = None
# Resource history for trending
self.resource_history = []
self.max_history_entries = 100
# Last warning times to prevent spam
self.last_memory_warning = datetime.min
self.last_cpu_warning = datetime.min
self.last_disk_warning = datetime.min
self.warning_cooldown = timedelta(minutes=5)
def start_monitoring(self):
"""Start resource monitoring in background thread"""
if self.monitoring:
logger.warning("Resource monitoring already started")
return
self.monitoring = True
self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
self.monitor_thread.start()
logger.info(f"System resource monitoring started (memory threshold: {self.memory_threshold_mb}MB)")
def stop_monitoring(self):
"""Stop resource monitoring"""
self.monitoring = False
if self.monitor_thread:
self.monitor_thread.join(timeout=5)
logger.info("System resource monitoring stopped")
def set_callbacks(self,
memory_warning: Optional[Callable] = None,
cpu_warning: Optional[Callable] = None,
disk_warning: Optional[Callable] = None,
cleanup: Optional[Callable] = None):
"""Set callback functions for resource events"""
self.memory_warning_callback = memory_warning
self.cpu_warning_callback = cpu_warning
self.disk_warning_callback = disk_warning
self.cleanup_callback = cleanup
def get_current_usage(self) -> Dict[str, Any]:
"""Get current system resource usage"""
try:
# Memory usage
memory = psutil.virtual_memory()
memory_mb = memory.used / (1024 * 1024)
memory_percent = memory.percent
# CPU usage
cpu_percent = psutil.cpu_percent(interval=1)
# Disk usage (current directory)
disk = psutil.disk_usage('.')
disk_percent = (disk.used / disk.total) * 100
# Process-specific info
process = psutil.Process()
process_memory_mb = process.memory_info().rss / (1024 * 1024)
return {
'timestamp': datetime.now(),
'memory': {
'total_mb': memory.total / (1024 * 1024),
'used_mb': memory_mb,
'percent': memory_percent,
'available_mb': memory.available / (1024 * 1024)
},
'process_memory_mb': process_memory_mb,
'cpu_percent': cpu_percent,
'disk': {
'total_gb': disk.total / (1024 * 1024 * 1024),
'used_gb': disk.used / (1024 * 1024 * 1024),
'percent': disk_percent
}
}
except Exception as e:
logger.error(f"Error getting system usage: {e}")
return {}
def _monitor_loop(self):
"""Main monitoring loop"""
logger.info("Resource monitoring loop started")
while self.monitoring:
try:
usage = self.get_current_usage()
if not usage:
time.sleep(self.check_interval)
continue
# Store in history
self.resource_history.append(usage)
if len(self.resource_history) > self.max_history_entries:
self.resource_history.pop(0)
# Check thresholds
self._check_memory_threshold(usage)
self._check_cpu_threshold(usage)
self._check_disk_threshold(usage)
# Log periodic status (every 10 minutes)
if len(self.resource_history) % 20 == 0: # 20 * 30s = 10 minutes
self._log_resource_status(usage)
except Exception as e:
logger.error(f"Error in resource monitoring loop: {e}")
time.sleep(self.check_interval)
logger.info("Resource monitoring loop stopped")
def _check_memory_threshold(self, usage: Dict[str, Any]):
"""Check memory usage threshold"""
memory_mb = usage.get('memory', {}).get('used_mb', 0)
if memory_mb > self.memory_threshold_mb:
now = datetime.now()
if now - self.last_memory_warning > self.warning_cooldown:
logger.warning(f"HIGH MEMORY USAGE: {memory_mb:.1f}MB / {self.memory_threshold_mb}MB threshold")
self.last_memory_warning = now
# Trigger cleanup
self._trigger_memory_cleanup()
# Call callback if set
if self.memory_warning_callback:
try:
self.memory_warning_callback(memory_mb, self.memory_threshold_mb)
except Exception as e:
logger.error(f"Error in memory warning callback: {e}")
def _check_cpu_threshold(self, usage: Dict[str, Any]):
"""Check CPU usage threshold"""
cpu_percent = usage.get('cpu_percent', 0)
if cpu_percent > self.cpu_threshold_percent:
now = datetime.now()
if now - self.last_cpu_warning > self.warning_cooldown:
logger.warning(f"HIGH CPU USAGE: {cpu_percent:.1f}% / {self.cpu_threshold_percent}% threshold")
self.last_cpu_warning = now
if self.cpu_warning_callback:
try:
self.cpu_warning_callback(cpu_percent, self.cpu_threshold_percent)
except Exception as e:
logger.error(f"Error in CPU warning callback: {e}")
def _check_disk_threshold(self, usage: Dict[str, Any]):
"""Check disk usage threshold"""
disk_percent = usage.get('disk', {}).get('percent', 0)
if disk_percent > self.disk_threshold_percent:
now = datetime.now()
if now - self.last_disk_warning > self.warning_cooldown:
logger.warning(f"HIGH DISK USAGE: {disk_percent:.1f}% / {self.disk_threshold_percent}% threshold")
self.last_disk_warning = now
if self.disk_warning_callback:
try:
self.disk_warning_callback(disk_percent, self.disk_threshold_percent)
except Exception as e:
logger.error(f"Error in disk warning callback: {e}")
def _trigger_memory_cleanup(self):
"""Trigger memory cleanup procedures"""
logger.info("Triggering memory cleanup...")
# Force garbage collection
collected = gc.collect()
logger.info(f"Garbage collection freed {collected} objects")
# Call custom cleanup callback if set
if self.cleanup_callback:
try:
self.cleanup_callback()
logger.info("Custom cleanup callback executed")
except Exception as e:
logger.error(f"Error in cleanup callback: {e}")
# Log memory after cleanup
try:
usage_after = self.get_current_usage()
memory_after = usage_after.get('memory', {}).get('used_mb', 0)
logger.info(f"Memory after cleanup: {memory_after:.1f}MB")
except Exception as e:
logger.error(f"Error checking memory after cleanup: {e}")
def _log_resource_status(self, usage: Dict[str, Any]):
"""Log current resource status"""
memory = usage.get('memory', {})
cpu = usage.get('cpu_percent', 0)
disk = usage.get('disk', {})
process_memory = usage.get('process_memory_mb', 0)
logger.info(f"RESOURCE STATUS - Memory: {memory.get('used_mb', 0):.1f}MB ({memory.get('percent', 0):.1f}%), "
f"Process: {process_memory:.1f}MB, CPU: {cpu:.1f}%, Disk: {disk.get('percent', 0):.1f}%")
def get_resource_summary(self) -> Dict[str, Any]:
"""Get resource usage summary"""
if not self.resource_history:
return {}
recent_usage = self.resource_history[-10:] # Last 10 entries
# Calculate averages
avg_memory = sum(u.get('memory', {}).get('used_mb', 0) for u in recent_usage) / len(recent_usage)
avg_cpu = sum(u.get('cpu_percent', 0) for u in recent_usage) / len(recent_usage)
avg_disk = sum(u.get('disk', {}).get('percent', 0) for u in recent_usage) / len(recent_usage)
current = self.resource_history[-1] if self.resource_history else {}
return {
'current': current,
'averages': {
'memory_mb': avg_memory,
'cpu_percent': avg_cpu,
'disk_percent': avg_disk
},
'thresholds': {
'memory_mb': self.memory_threshold_mb,
'cpu_percent': self.cpu_threshold_percent,
'disk_percent': self.disk_threshold_percent
},
'monitoring': self.monitoring,
'history_entries': len(self.resource_history)
}
# Global instance
_system_monitor = None
def get_system_monitor() -> SystemResourceMonitor:
"""Get global system monitor instance"""
global _system_monitor
if _system_monitor is None:
_system_monitor = SystemResourceMonitor()
return _system_monitor
def start_system_monitoring():
"""Start system monitoring with default settings"""
monitor = get_system_monitor()
monitor.start_monitoring()
return monitor
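
A minimal sketch (assumed module path utils.system_monitor) of wiring the monitor above to a custom cleanup hook:

import time
from utils.system_monitor import start_system_monitoring  # assumed path

def free_caches():
    print("dropping in-memory caches")  # plug in real cache eviction here

monitor = start_system_monitoring()
monitor.set_callbacks(cleanup=free_caches)
time.sleep(65)  # allow a couple of 30-second checks to run
print(monitor.get_resource_summary())
monitor.stop_monitoring()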


@@ -1,219 +0,0 @@
#!/usr/bin/env python3
"""
TensorBoard Logger Utility
This module provides a centralized way to log training metrics to TensorBoard.
It ensures consistent logging across different training components.
"""
import os
import logging
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, Optional, Union, List
# Import conditionally to handle missing dependencies gracefully
try:
from torch.utils.tensorboard import SummaryWriter
TENSORBOARD_AVAILABLE = True
except ImportError:
TENSORBOARD_AVAILABLE = False
logger = logging.getLogger(__name__)
class TensorBoardLogger:
"""
Centralized TensorBoard logging utility for training metrics
This class provides a consistent interface for logging metrics to TensorBoard
across different training components.
"""
def __init__(self,
log_dir: Optional[str] = None,
experiment_name: Optional[str] = None,
enabled: bool = True):
"""
Initialize TensorBoard logger
Args:
log_dir: Base directory for TensorBoard logs (default: 'runs')
experiment_name: Name of the experiment (default: timestamp)
enabled: Whether TensorBoard logging is enabled
"""
self.enabled = enabled and TENSORBOARD_AVAILABLE
self.writer = None
if not self.enabled:
if not TENSORBOARD_AVAILABLE:
logger.warning("TensorBoard not available. Install with: pip install tensorboard")
return
# Set up log directory
if log_dir is None:
log_dir = "runs"
# Create experiment name if not provided
if experiment_name is None:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
experiment_name = f"training_{timestamp}"
# Create full log path
self.log_dir = os.path.join(log_dir, experiment_name)
# Create writer
try:
self.writer = SummaryWriter(log_dir=self.log_dir)
logger.info(f"TensorBoard logging enabled at: {self.log_dir}")
except Exception as e:
logger.error(f"Failed to initialize TensorBoard: {e}")
self.enabled = False
def log_scalar(self, tag: str, value: float, step: int) -> None:
"""
Log a scalar value to TensorBoard
Args:
tag: Metric name
value: Metric value
step: Training step
"""
if not self.enabled or self.writer is None:
return
try:
self.writer.add_scalar(tag, value, step)
except Exception as e:
logger.warning(f"Failed to log scalar {tag}: {e}")
def log_scalars(self, main_tag: str, tag_value_dict: Dict[str, float], step: int) -> None:
"""
Log multiple scalar values with the same main tag
Args:
main_tag: Main tag for the metrics
tag_value_dict: Dictionary of tag names to values
step: Training step
"""
if not self.enabled or self.writer is None:
return
try:
self.writer.add_scalars(main_tag, tag_value_dict, step)
except Exception as e:
logger.warning(f"Failed to log scalars for {main_tag}: {e}")
def log_histogram(self, tag: str, values, step: int) -> None:
"""
Log a histogram to TensorBoard
Args:
tag: Histogram name
values: Values to create histogram from
step: Training step
"""
if not self.enabled or self.writer is None:
return
try:
self.writer.add_histogram(tag, values, step)
except Exception as e:
logger.warning(f"Failed to log histogram {tag}: {e}")
def log_training_metrics(self,
metrics: Dict[str, Any],
step: int,
prefix: str = "Training") -> None:
"""
Log training metrics to TensorBoard
Args:
metrics: Dictionary of metric names to values
step: Training step
prefix: Prefix for metric names
"""
if not self.enabled or self.writer is None:
return
for name, value in metrics.items():
if isinstance(value, (int, float)):
self.log_scalar(f"{prefix}/{name}", value, step)
elif hasattr(value, "shape"): # For numpy arrays or tensors
try:
self.log_histogram(f"{prefix}/{name}", value, step)
except Exception:
pass  # histograms are best-effort; skip values that cannot be histogrammed
def log_model_metrics(self,
model_name: str,
metrics: Dict[str, Any],
step: int) -> None:
"""
Log model-specific metrics to TensorBoard
Args:
model_name: Name of the model
metrics: Dictionary of metric names to values
step: Training step
"""
if not self.enabled or self.writer is None:
return
for name, value in metrics.items():
if isinstance(value, (int, float)):
self.log_scalar(f"Model/{model_name}/{name}", value, step)
def log_reward_metrics(self,
symbol: str,
metrics: Dict[str, float],
step: int) -> None:
"""
Log reward-related metrics to TensorBoard
Args:
symbol: Trading symbol
metrics: Dictionary of metric names to values
step: Training step
"""
if not self.enabled or self.writer is None:
return
for name, value in metrics.items():
self.log_scalar(f"Rewards/{symbol}/{name}", value, step)
def log_state_metrics(self,
symbol: str,
state_info: Dict[str, Any],
step: int) -> None:
"""
Log state-related metrics to TensorBoard
Args:
symbol: Trading symbol
state_info: Dictionary of state information
step: Training step
"""
if not self.enabled or self.writer is None:
return
# Log state size
if "size" in state_info:
self.log_scalar(f"State/{symbol}/Size", state_info["size"], step)
# Log state quality
if "quality" in state_info:
self.log_scalar(f"State/{symbol}/Quality", state_info["quality"], step)
# Log feature counts
if "feature_counts" in state_info:
for feature_type, count in state_info["feature_counts"].items():
self.log_scalar(f"State/{symbol}/Features/{feature_type}", count, step)
def close(self) -> None:
"""Close the TensorBoard writer"""
if self.enabled and self.writer is not None:
try:
self.writer.close()
logger.info("TensorBoard writer closed")
except Exception as e:
logger.warning(f"Error closing TensorBoard writer: {e}")