# gogo2/utils/launch_tensorboard.py

#!/usr/bin/env python3
"""
TensorBoard Launcher with Automatic Port Management

This script launches TensorBoard with automatic port fallback if the preferred port is in use.
It can also kill stale debug instances before starting when --kill-stale is passed.

Usage:
    python launch_tensorboard.py --logdir=path/to/logs --preferred-port=6007 --port-range=6000-7000
"""

import os
import sys
import subprocess
import argparse
import logging
from pathlib import Path

# Make the directory two levels above this file importable, so the
# `utils.port_manager` import below resolves when this file is run directly
# as a script (presumably this file lives in `<project_root>/utils/`).
# NOTE: this must stay above the `utils` import; the order is significant.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if project_root not in sys.path:
    # Appended (not inserted at index 0), so entries already on sys.path
    # take precedence over the project root.
    sys.path.append(project_root)

from utils.port_manager import get_port_with_fallback, kill_stale_debug_instances

# Configure the root logger at import time (INFO level, timestamped records).
# basicConfig() is a no-op if the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-wide logger used by every function in this script.
logger = logging.getLogger('tensorboard_launcher')

def launch_tensorboard(logdir, port, host='localhost', open_browser=True):
    """
    Launch TensorBoard on the specified port without blocking the caller.

    TensorBoard is started as a child process of the current interpreter
    (``python -m tensorboard.main``). Its combined stdout/stderr is consumed
    by a daemon thread which logs the first few lines, reports the serving
    URL once TensorBoard announces it and, if requested, opens that URL in
    the default web browser. The function itself returns as soon as the
    child process has been spawned.

    Args:
        logdir (str): Path to log directory
        port (int): Port to use
        host (str): Host to bind to
        open_browser (bool): Whether to open browser automatically once
            TensorBoard reports its URL

    Returns:
        subprocess.Popen: Process object. Its ``stdout`` pipe is owned by the
        background reader thread; callers should not read from it.
    """
    # Imported locally so this function only depends on names it brings in
    # itself plus the module-level ``sys``, ``subprocess`` and ``logger``.
    import threading
    import webbrowser

    cmd = [
        sys.executable, "-m", "tensorboard.main",
        f"--logdir={logdir}",
        f"--port={port}",
        f"--host={host}",
        # Turn off TensorBoard's alternate "fast" data-loading mechanism
        # (presumably for compatibility; it is not available everywhere).
        "--load_fast=false",
    ]

    logger.info(f"Launching TensorBoard: {' '.join(cmd)}")

    # Popen starts TensorBoard without waiting for it to finish. stderr is
    # merged into stdout so a single reader sees everything, and line
    # buffering (bufsize=1, text mode) lets us react to each line promptly.
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        bufsize=1
    )

    # Only the first few lines are logged, to confirm a healthy start-up.
    max_logged_lines = 10

    def _pump_output():
        """Log early output, detect the URL, then drain the pipe silently."""
        logged = 0
        url_seen = False
        # Reading until EOF keeps the pipe from filling up, which would
        # otherwise stall TensorBoard once the OS buffer is full.
        for raw_line in process.stdout:
            line = raw_line.strip()

            if logged < max_logged_lines:
                logger.info(f"TensorBoard: {line}")
                logged += 1

            # Keep looking for the URL even past the logged lines: start-up
            # warnings can push the announcement beyond the first few lines.
            if url_seen or "TensorBoard" not in line or "http://" not in line:
                continue

            url_seen = True
            url = "http://" + line.split("http://")[1].split(" ")[0]
            logger.info(f"TensorBoard available at: {url}")

            # TensorBoard has no command-line switch for launching a browser,
            # so honour ``open_browser`` here once the server is reachable.
            if open_browser:
                try:
                    webbrowser.open(url)
                except webbrowser.Error as exc:
                    # Never let a browser problem kill the drain thread.
                    logger.warning(f"Could not open browser: {exc}")

    threading.Thread(target=_pump_output, daemon=True).start()

    return process

def main():
    """
    Command-line entry point: pick a free port and run TensorBoard on it.

    Parses the command-line options, optionally kills stale debug instances,
    selects a port (preferred port first, then the fallback range), makes
    sure the log directory exists, launches TensorBoard and waits for it to
    exit. Ctrl+C terminates TensorBoard (forcefully after a 5 second grace
    period).

    Returns:
        int: Process exit status. 0 on success or user interrupt, 1 for an
        invalid port range or when no port is available, otherwise
        TensorBoard's own non-zero return code.
    """
    parser = argparse.ArgumentParser(description='Launch TensorBoard with automatic port management')
    parser.add_argument('--logdir', type=str, default='NN/models/saved/logs',
                       help='Directory containing TensorBoard event files')
    parser.add_argument('--preferred-port', type=int, default=6007,
                       help='Preferred port to use')
    parser.add_argument('--port-range', type=str, default='6000-7000',
                       help='Port range to try if preferred port is unavailable (format: min-max)')
    parser.add_argument('--host', type=str, default='localhost',
                       help='Host to bind to')
    parser.add_argument('--no-browser', action='store_true',
                       help='Do not open browser automatically')
    parser.add_argument('--kill-stale', action='store_true',
                       help='Kill stale debug instances before starting')

    args = parser.parse_args()

    # Parse port range. A ValueError covers both non-numeric parts and a
    # wrong number of '-'-separated fields (tuple unpacking failure).
    try:
        min_port, max_port = map(int, args.port_range.split('-'))
    except ValueError:
        logger.error(f"Invalid port range format: {args.port_range}. Use format: min-max")
        return 1

    # Reject reversed or out-of-bounds ranges up front instead of handing
    # them to the port search.
    if not 1 <= min_port <= max_port <= 65535:
        logger.error(
            f"Invalid port range: {args.port_range}. "
            "Ports must satisfy 1 <= min <= max <= 65535"
        )
        return 1

    # Kill stale instances if requested
    if args.kill_stale:
        logger.info("Killing stale debug instances...")
        # Only the first element of the returned pair (the kill count) is used.
        count, _ = kill_stale_debug_instances()
        logger.info(f"Killed {count} stale instances")

    # Get an available port
    try:
        port = get_port_with_fallback(args.preferred_port, min_port, max_port)
        logger.info(f"Using port {port} for TensorBoard")
    except RuntimeError as e:
        logger.error(str(e))
        return 1

    # Ensure log directory exists
    logdir = os.path.abspath(args.logdir)
    os.makedirs(logdir, exist_ok=True)

    # Launch TensorBoard
    process = launch_tensorboard(
        logdir=logdir,
        port=port,
        host=args.host,
        open_browser=not args.no_browser
    )

    # Wait for process to end (it shouldn't unless there's an error or user kills it)
    try:
        return_code = process.wait()
        if return_code != 0:
            logger.error(f"TensorBoard exited with code {return_code}")
            return return_code
    except KeyboardInterrupt:
        logger.info("Received keyboard interrupt, shutting down TensorBoard...")
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            logger.warning("TensorBoard didn't terminate gracefully, forcing kill")
            process.kill()
            # Reap the killed child so it is not left behind as a zombie.
            process.wait()

    return 0

# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())