try fixing GPU (torch)

2025-11-17 13:06:37 +02:00
parent 4fcadcdbff
commit 43a7d75daf
9 changed files with 1393 additions and 11 deletions
--- a/scripts/attach-to-rocm-container.sh
+++ b/scripts/attach-to-rocm-container.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# Attach to existing AMD Strix Halo ROCm container for development
+
+set -e
+
+CONTAINER_NAME="amd-strix-halo-llama-rocm"
+PROJECT_PATH="/mnt/shared/DEV/repos/d-popov.com/gogo2"
+CONTAINER_PROJECT_PATH="/workspace/gogo2"
+
+echo "=================================================="
+echo "  Attaching to AMD Strix Halo ROCm Container"
+echo "=================================================="
+echo ""
+
+# Check that the container is currently running.
+# `grep -Fxq` compares each listed name literally and as a whole line, so
+# regex metacharacters in the configured name cannot cause a false match.
+if ! docker ps --format '{{.Names}}' | grep -Fxq -- "${CONTAINER_NAME}"; then
+    echo "❌ Container '${CONTAINER_NAME}' is not running"
+    echo ""
+    echo "Available containers:"
+    # -a also lists stopped containers: the hint below suggests `docker start`,
+    # which only applies to a container that exists but is not running.
+    # `|| true` keeps `set -e` from aborting before the hint is printed.
+    docker ps -a --format "  - {{.Names}} ({{.Status}})" || true
+    echo ""
+    echo "To start it: docker start ${CONTAINER_NAME}"
+    exit 1
+fi
+
+echo "✓ Container is running"
+echo ""
+
+# Check if the project directory is visible inside the container; if it is
+# not, print the available options and offer to copy it in with `docker cp`.
+echo "Checking project accessibility..."
+if docker exec "$CONTAINER_NAME" test -d "$CONTAINER_PROJECT_PATH" 2>/dev/null; then
+    echo "✓ Project already accessible at: $CONTAINER_PROJECT_PATH"
+else
+    echo "⚠️  Project not mounted in container"
+    echo ""
+    echo "OPTION 1: Mount project directory (requires container restart)"
+    echo "  Add to docker-compose or docker run:"
+    echo "    -v $PROJECT_PATH:$CONTAINER_PROJECT_PATH"
+    echo ""
+    echo "OPTION 2: Copy project into container"
+    echo "  docker cp $PROJECT_PATH $CONTAINER_NAME:/workspace/"
+    echo ""
+    echo "OPTION 3: Work from host's home directory mount"
+    echo "  (if accessible via /home/db/...)"
+    echo ""
+    # `read` returns non-zero on EOF (e.g. stdin is not a terminal). Treat that
+    # as "no" instead of letting `set -e` silently abort the script here.
+    read -p "Copy project to container now? (y/N): " -n 1 -r || REPLY=""
+    echo
+    if [[ $REPLY =~ ^[Yy]$ ]]; then
+        echo "Copying project to container..."
+        # Expansions are quoted so each path reaches docker as a single argument.
+        docker exec "$CONTAINER_NAME" mkdir -p /workspace
+        docker cp "$PROJECT_PATH" "$CONTAINER_NAME:/workspace/"
+        echo "✓ Project copied"
+    else
+        echo "Skipping project copy"
+    fi
+fi
+echo ""
+
+# Check for Python
+echo "Checking Python installation..."
+if docker exec $CONTAINER_NAME which python3 &>/dev/null; then
+    PYTHON_VERSION=$(docker exec $CONTAINER_NAME python3 --version)
+    echo "✓ Python installed: $PYTHON_VERSION"
+else
+    echo "⚠️  Python not installed in container"
+    echo ""
+    echo "Install Python? (Fedora-based container)"
+    read -p "Install Python 3.12 + pip? (y/N): " -n 1 -r
+    echo
+    if [[ $REPLY =~ ^[Yy]$ ]]; then
+        echo "Installing Python..."
+        docker exec $CONTAINER_NAME dnf install -y python3.12 python3-pip python3-devel git
+        docker exec $CONTAINER_NAME ln -sf /usr/bin/python3.12 /usr/bin/python3
+        docker exec $CONTAINER_NAME ln -sf /usr/bin/python3.12 /usr/bin/python
+        echo "✓ Python installed"
+    else
+        echo "Skipping Python installation"
+    fi
+fi
+echo ""
+
+# Check for PyTorch inside the container and offer to install the ROCm build.
+echo "Checking PyTorch..."
+if docker exec "$CONTAINER_NAME" python3 -c "import torch" &>/dev/null; then
+    TORCH_INFO=$(docker exec "$CONTAINER_NAME" python3 -c "import torch; print(f'{torch.__version__}, CUDA: {torch.cuda.is_available()}')")
+    echo "✓ PyTorch installed: $TORCH_INFO"
+else
+    echo "⚠️  PyTorch not installed"
+    echo ""
+    echo "This container should have ROCm support built-in"
+    echo "Install PyTorch with ROCm support?"
+    # `read` returns non-zero on EOF; treat that as "no" rather than letting
+    # `set -e` abort the script.
+    read -p "Install PyTorch ROCm? (y/N): " -n 1 -r || REPLY=""
+    echo
+    if [[ $REPLY =~ ^[Yy]$ ]]; then
+        echo "Installing PyTorch with ROCm..."
+        # Install through `python3 -m pip` so the package lands in the same
+        # interpreter that the import check above uses; a standalone `pip3`
+        # can belong to a different Python installation.
+        docker exec "$CONTAINER_NAME" python3 -m pip install torch --index-url https://download.pytorch.org/whl/rocm6.2
+        echo "✓ PyTorch installed"
+    else
+        echo "Skipping PyTorch installation"
+    fi
+fi
+echo ""
+
+# Closing banner and usage hints, printed right before the interactive shell
+# starts. The here-doc delimiter is unquoted so the two variables expand.
+cat <<EOF
+==================================================
+  Ready to Attach!
+==================================================
+
+Container: $CONTAINER_NAME
+Project:   $CONTAINER_PROJECT_PATH
+
+Attaching to container shell...
+(Use 'exit' or Ctrl+D to detach)
+
+Once inside, navigate to:
+  cd $CONTAINER_PROJECT_PATH
+
+Install project dependencies:
+  pip3 install -r requirements.txt
+
+Run ANNOTATE:
+  python3 ANNOTATE/web/app.py
+
+==================================================
+
+EOF
+
+# Attach to container (name quoted so it is always passed as one argument)
+docker exec -it "$CONTAINER_NAME" bash
+
+
+
--- a/scripts/setup-pytorch.sh
+++ b/scripts/setup-pytorch.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+# Automatic PyTorch installation script
+# Detects hardware and installs the appropriate PyTorch build
+# Works with: NVIDIA (CUDA), AMD (ROCm), or CPU-only
+
+# Stop at the first command that fails.
+set -e
+
+# Opening banner, followed by one blank line.
+printf '%s\n' \
+    "==================================================" \
+    "  PyTorch Auto-Setup for Trading System" \
+    "==================================================" \
+    ""
+
+# ANSI colour sequences. They are stored as literal backslash text and are
+# turned into escape characters by `echo -e` at the point of use.
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Detect GPU hardware
+detect_hardware() {
+    echo "Detecting GPU hardware..."
+    
+    # Check for NVIDIA GPU
+    if command -v nvidia-smi &> /dev/null; then
+        if nvidia-smi &> /dev/null; then
+            echo -e "${GREEN}✓ NVIDIA GPU detected${NC}"
+            CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d. -f1,2)
+            echo "  CUDA Version: $CUDA_VERSION"
+            GPU_TYPE="nvidia"
+            return
+        fi
+    fi
+    
+    # Check for AMD GPU
+    if lspci 2>/dev/null | grep -iE "VGA|3D|Display" | grep -iq "AMD\|ATI"; then
+        echo -e "${GREEN}✓ AMD GPU detected${NC}"
+        GPU_MODEL=$(lspci | grep -iE "VGA|3D|Display" | grep -i "AMD\|ATI" | head -1)
+        echo "  $GPU_MODEL"
+        
+        # Check if ROCm is available
+        if command -v rocm-smi &> /dev/null; then
+            ROCM_VERSION=$(rocm-smi --version 2>/dev/null | grep "ROCm" | awk '{print $3}' || echo "unknown")
+            echo "  ROCm installed: $ROCM_VERSION"
+        else
+            echo -e "${YELLOW}  ⚠ ROCm not detected - will install ROCm PyTorch anyway${NC}"
+        fi
+        
+        GPU_TYPE="amd"
+        return
+    fi
+    
+    # No GPU detected
+    echo -e "${YELLOW}⚠ No GPU detected - will use CPU-only build${NC}"
+    GPU_TYPE="cpu"
+}
+
+# Check if PyTorch is already installed
+check_existing_pytorch() {
+    if python -c "import torch" 2>/dev/null; then
+        TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
+        GPU_AVAILABLE=$(python -c "import torch; print(torch.cuda.is_available())")
+        
+        echo ""
+        echo "PyTorch is already installed:"
+        echo "  Version: $TORCH_VERSION"
+        echo "  GPU available: $GPU_AVAILABLE"
+        echo ""
+        
+        read -p "Reinstall PyTorch? (y/N): " -n 1 -r
+        echo
+        if [[ ! $REPLY =~ ^[Yy]$ ]]; then
+            echo "Keeping existing PyTorch installation"
+            exit 0
+        fi
+        
+        echo "Uninstalling existing PyTorch..."
+        pip uninstall -y torch 2>/dev/null || true
+    fi
+}
+
+# Install PyTorch based on hardware
+install_pytorch() {
+    echo ""
+    echo "Installing PyTorch for $GPU_TYPE..."
+    echo ""
+    
+    case $GPU_TYPE in
+        nvidia)
+            # Determine CUDA version to use
+            if [[ "$CUDA_VERSION" == "12.1" ]] || [[ "$CUDA_VERSION" == "12.2" ]] || [[ "$CUDA_VERSION" == "12.3" ]]; then
+                CUDA_BUILD="cu121"
+            elif [[ "$CUDA_VERSION" == "12.4" ]] || [[ "$CUDA_VERSION" == "12.5" ]] || [[ "$CUDA_VERSION" == "12.6" ]]; then
+                CUDA_BUILD="cu124"
+            elif [[ "$CUDA_VERSION" == "11."* ]]; then
+                CUDA_BUILD="cu118"
+            else
+                echo -e "${YELLOW}⚠ Unknown CUDA version, using CUDA 12.1 build${NC}"
+                CUDA_BUILD="cu121"
+            fi
+            
+            echo "Installing PyTorch with CUDA $CUDA_BUILD support..."
+            pip install torch --index-url https://download.pytorch.org/whl/$CUDA_BUILD
+            ;;
+            
+        amd)
+            echo "Installing PyTorch with ROCm 6.2 support..."
+            echo "(This works with RDNA 2, RDNA 3, and newer AMD GPUs)"
+            pip install torch --index-url https://download.pytorch.org/whl/rocm6.2
+            ;;
+            
+        cpu)
+            echo "Installing CPU-only PyTorch..."
+            pip install torch --index-url https://download.pytorch.org/whl/cpu
+            ;;
+    esac
+}
+
+# Verify installation: confirm torch imports, report version and GPU details,
+# then print the closing banner with next steps.
+#
+# Globals read:    GPU_TYPE, RED, GREEN, YELLOW, NC
+# Globals written: TORCH_VERSION, GPU_AVAILABLE and, when a GPU is usable,
+#                  DEVICE_NAME, DEVICE_COUNT, MEMORY_GB
+# Exits with status 1 when torch cannot be imported.
+verify_installation() {
+    local speedup_note=""
+
+    printf '%s\n' "" "Verifying installation..." ""
+
+    # Guard clause: without an importable torch there is nothing to report
+    python -c "import torch" 2>/dev/null || {
+        echo -e "${RED}✗ PyTorch installation failed!${NC}"
+        exit 1
+    }
+
+    TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
+    GPU_AVAILABLE=$(python -c "import torch; print(torch.cuda.is_available())")
+
+    echo -e "${GREEN}✓ PyTorch installed successfully!${NC}"
+    printf '%s\n' \
+        "  Version: $TORCH_VERSION" \
+        "  GPU available: $GPU_AVAILABLE"
+
+    if [[ "$GPU_AVAILABLE" == "True" ]]; then
+        DEVICE_NAME=$(python -c "import torch; print(torch.cuda.get_device_name(0))")
+        DEVICE_COUNT=$(python -c "import torch; print(torch.cuda.device_count())")
+        MEMORY_GB=$(python -c "import torch; print(f'{torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}')")
+
+        echo -e "${GREEN}  Device: $DEVICE_NAME${NC}"
+        printf '%s\n' \
+            "  Count: $DEVICE_COUNT" \
+            "  Memory: ${MEMORY_GB} GB"
+
+        # Vendor-specific speed-up line; other GPU types print nothing extra
+        if [[ "$GPU_TYPE" == "nvidia" ]]; then
+            speedup_note="🚀 Training will be 5-10x faster with NVIDIA GPU!"
+        elif [[ "$GPU_TYPE" == "amd" ]]; then
+            speedup_note="🚀 Training will be 2-3x faster with AMD GPU!"
+        fi
+        if [[ -n "$speedup_note" ]]; then
+            printf '%s\n' "" "$speedup_note"
+        fi
+    elif [[ "$GPU_TYPE" != "cpu" ]]; then
+        # Hardware was detected earlier but torch cannot use it
+        echo -e "${YELLOW}⚠ GPU detected but not available in PyTorch${NC}"
+        printf '%s\n' \
+            "  This might mean:" \
+            "  - GPU drivers need to be installed/updated" \
+            "  - Wrong PyTorch build was installed" \
+            "  - GPU is not supported"
+    else
+        echo "  CPU-only mode (slower training)"
+    fi
+
+    printf '%s\n' \
+        "" \
+        "==================================================" \
+        "✓ Setup complete!" \
+        "==================================================" \
+        "" \
+        "Test your setup:" \
+        "  python -c \"import torch; print(f'GPU: {torch.cuda.is_available()}')\"" \
+        "" \
+        "Start ANNOTATE:" \
+        "  python ANNOTATE/web/app.py" \
+        ""
+}
+
+# Main execution: report or warn about the virtual environment, then run the
+# four setup stages in order.
+#
+# Globals read: VIRTUAL_ENV, GREEN, YELLOW, NC
+# Exits with status 1 when no virtual environment is active and the user does
+# not answer y/Y to the prompt.
+main() {
+    local stage
+
+    if [[ -n "$VIRTUAL_ENV" ]]; then
+        echo -e "${GREEN}✓ Virtual environment active: $VIRTUAL_ENV${NC}"
+        echo ""
+    else
+        echo -e "${YELLOW}⚠ Not in a virtual environment${NC}"
+        printf '%s\n' \
+            "" \
+            "It's recommended to use a virtual environment:" \
+            "  python -m venv venv" \
+            "  source venv/bin/activate  # Linux/Mac" \
+            '  .\venv\Scripts\activate   # Windows' \
+            ""
+        read -p "Continue anyway? (y/N): " -n 1 -r
+        echo
+        # Anything other than a single y/Y stops here
+        [[ $REPLY =~ ^[Yy]$ ]] || exit 1
+    fi
+
+    # Each stage relies on state left by the previous one, so order matters
+    for stage in detect_hardware check_existing_pytorch install_pytorch verify_installation; do
+        "$stage"
+    done
+}
+
+# Run main function
+main
+