try fixing GPU (torch)

This commit is contained in:
Dobromir Popov
2025-11-17 13:06:37 +02:00
parent 4fcadcdbff
commit 43a7d75daf
9 changed files with 1393 additions and 11 deletions

View File

@@ -0,0 +1,128 @@
#!/bin/bash
# Attach to existing AMD Strix Halo ROCm container for development.
#
# Configuration (each value may be overridden from the environment; the
# defaults are the values this script has always used):
#   CONTAINER_NAME          name of the running Docker container
#   PROJECT_PATH            project checkout on the host
#   CONTAINER_PROJECT_PATH  where the project is expected inside the container
set -e

CONTAINER_NAME="${CONTAINER_NAME:-amd-strix-halo-llama-rocm}"
PROJECT_PATH="${PROJECT_PATH:-/mnt/shared/DEV/repos/d-popov.com/gogo2}"
CONTAINER_PROJECT_PATH="${CONTAINER_PROJECT_PATH:-/workspace/gogo2}"

echo "=================================================="
echo " Attaching to AMD Strix Halo ROCm Container"
echo "=================================================="
echo ""

# Fail early with an accurate message when the docker CLI itself is missing,
# instead of reporting that the container is "not running".
if ! command -v docker >/dev/null 2>&1; then
  echo "❌ docker command not found in PATH" >&2
  exit 1
fi

# Check if container exists and is running.
# -F/-x compare the whole line literally, so regex metacharacters in the
# container name (e.g. '.') cannot match a different container.
if ! docker ps --format '{{.Names}}' | grep -Fxq -- "${CONTAINER_NAME}"; then
  echo "❌ Container '${CONTAINER_NAME}' is not running"
  echo ""
  echo "Available containers:"
  # Best effort: still print the start hint if listing fails.
  docker ps --format " - {{.Names}} ({{.Status}})" || true
  echo ""
  echo "To start it: docker start ${CONTAINER_NAME}"
  exit 1
fi
echo "✓ Container is running"
echo ""
# Check if project is already mounted or accessible inside the container.
# If it is not, print the available options and offer to copy the host
# checkout (PROJECT_PATH) into the container at CONTAINER_PROJECT_PATH.
echo "Checking project accessibility..."
if docker exec "$CONTAINER_NAME" test -d "$CONTAINER_PROJECT_PATH" 2>/dev/null; then
  echo "✓ Project already accessible at: $CONTAINER_PROJECT_PATH"
else
  echo "⚠️ Project not mounted in container"
  echo ""
  echo "OPTION 1: Mount project directory (requires container restart)"
  echo " Add to docker-compose or docker run:"
  echo " -v $PROJECT_PATH:$CONTAINER_PROJECT_PATH"
  echo ""
  echo "OPTION 2: Copy project into container"
  echo " docker cp $PROJECT_PATH $CONTAINER_NAME:/workspace/"
  echo ""
  echo "OPTION 3: Work from host's home directory mount"
  echo " (if accessible via /home/db/...)"
  echo ""
  # '|| true': read returns non-zero on EOF (non-interactive stdin), which
  # would abort the script under 'set -e'; treat it as the default "No".
  read -p "Copy project to container now? (y/N): " -n 1 -r || true
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    echo "Copying project to container..."
    # Copy the *contents* of the host checkout into CONTAINER_PROJECT_PATH
    # ("SRC/." form) so the result lands exactly where the rest of this
    # script looks for it, whatever the host directory's basename is.
    docker exec "$CONTAINER_NAME" mkdir -p "$CONTAINER_PROJECT_PATH"
    docker cp "$PROJECT_PATH/." "$CONTAINER_NAME:$CONTAINER_PROJECT_PATH"
    echo "✓ Project copied"
  else
    echo "Skipping project copy"
  fi
fi
echo ""
# Check for Python inside the container and offer to install it via dnf.
echo "Checking Python installation..."
# Probe with the shell builtin 'command -v' rather than 'which': 'which' is a
# separate executable that may not be installed in the container, in which
# case an existing python3 would be reported as missing and the install
# branch below would overwrite /usr/bin/python3.
if docker exec "$CONTAINER_NAME" sh -c 'command -v python3' &>/dev/null; then
  PYTHON_VERSION=$(docker exec "$CONTAINER_NAME" python3 --version)
  echo "✓ Python installed: $PYTHON_VERSION"
else
  echo "⚠️ Python not installed in container"
  echo ""
  echo "Install Python? (Fedora-based container)"
  # '|| true': on EOF read fails; under 'set -e' that would abort the script.
  # An empty reply means the default "No".
  read -p "Install Python 3.12 + pip? (y/N): " -n 1 -r || true
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    echo "Installing Python..."
    docker exec "$CONTAINER_NAME" dnf install -y python3.12 python3-pip python3-devel git
    # Make 'python3' and 'python' resolve to the freshly installed 3.12.
    docker exec "$CONTAINER_NAME" ln -sf /usr/bin/python3.12 /usr/bin/python3
    docker exec "$CONTAINER_NAME" ln -sf /usr/bin/python3.12 /usr/bin/python
    echo "✓ Python installed"
  else
    echo "Skipping Python installation"
  fi
fi
echo ""
# Check for PyTorch inside the container and offer to install the ROCm build.
echo "Checking PyTorch..."
if docker exec "$CONTAINER_NAME" python3 -c "import torch" &>/dev/null; then
  TORCH_INFO=$(docker exec "$CONTAINER_NAME" python3 -c "import torch; print(f'{torch.__version__}, CUDA: {torch.cuda.is_available()}')")
  echo "✓ PyTorch installed: $TORCH_INFO"
else
  echo "⚠️ PyTorch not installed"
  echo ""
  echo "This container should have ROCm support built-in"
  echo "Install PyTorch with ROCm support?"
  # '|| true': on EOF read fails; under 'set -e' that would abort the script.
  # An empty reply means the default "No".
  read -p "Install PyTorch ROCm? (y/N): " -n 1 -r || true
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    echo "Installing PyTorch with ROCm..."
    docker exec "$CONTAINER_NAME" pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.2
    # Confirm that the interpreter probed above can actually import torch
    # before claiming success: pip3 and python3 are separate executables and
    # are not guaranteed to target the same site-packages.
    if docker exec "$CONTAINER_NAME" python3 -c "import torch" &>/dev/null; then
      echo "✓ PyTorch installed"
    else
      echo "⚠️ pip3 finished but python3 still cannot import torch" >&2
      echo " (pip3 and python3 may belong to different Python installations)" >&2
    fi
  fi
fi
echo ""
# Print a summary with next steps, then open a shell in the container.
cat <<EOF
==================================================
 Ready to Attach!
==================================================

Container: $CONTAINER_NAME
Project: $CONTAINER_PROJECT_PATH

Attaching to container shell...
(Use 'exit' or Ctrl+D to detach)

Once inside, navigate to:
 cd $CONTAINER_PROJECT_PATH

Install project dependencies:
 pip3 install -r requirements.txt

Run ANNOTATE:
 python3 ANNOTATE/web/app.py

==================================================

EOF
# Attach to container.
# 'docker exec -t' refuses to run when stdin is not a terminal, so only
# request a TTY when one is attached; otherwise keep stdin open with -i.
if [[ -t 0 ]]; then
  docker exec -it "$CONTAINER_NAME" bash
else
  docker exec -i "$CONTAINER_NAME" bash
fi

209
scripts/setup-pytorch.sh Normal file
View File

@@ -0,0 +1,209 @@
#!/bin/bash
# Automatic PyTorch installation script
# Detects hardware and installs the appropriate PyTorch build
# Works with: NVIDIA (CUDA), AMD (ROCm), or CPU-only
set -e

# Start-up banner (printed verbatim; the trailing blank line is intentional).
cat <<'EOF'
==================================================
 PyTorch Auto-Setup for Trading System
==================================================

EOF

# Colors for output.
# Stored as literal backslash sequences; they are turned into real escape
# codes by the `echo -e` calls that use them.
NC='\033[0m' # No Color
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'
# Detect GPU hardware.
#
# Probes, in order: a working `nvidia-smi`, then `lspci` for an AMD/ATI
# display adapter; otherwise falls back to CPU.
# Globals read:
#   GREEN, YELLOW, NC  colour escapes used with `echo -e`
# Globals written:
#   GPU_TYPE      "nvidia", "amd" or "cpu"
#   CUDA_VERSION  major.minor parsed from nvidia-smi (nvidia branch only)
#   GPU_MODEL     first matching lspci line (amd branch only)
#   ROCM_VERSION  value parsed from rocm-smi, or "unknown" (amd branch, only
#                 when rocm-smi exists)
# Outputs: progress messages on stdout.
detect_hardware() {
  local display_adapters amd_adapters

  echo "Detecting GPU hardware..."

  # Check for NVIDIA GPU: the tool must exist AND run successfully.
  if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
    echo -e "${GREEN}✓ NVIDIA GPU detected${NC}"
    # Anchor on the "CUDA Version:" label instead of a field position so the
    # parse does not depend on the exact column layout of the header line.
    CUDA_VERSION=$(nvidia-smi \
      | sed -n 's/.*CUDA Version: *\([0-9][0-9]*\.[0-9][0-9]*\).*/\1/p' \
      | head -n 1)
    echo " CUDA Version: $CUDA_VERSION"
    GPU_TYPE="nvidia"
    return
  fi

  # Check for AMD GPU.
  # '|| true' keeps a non-matching grep from aborting the script under set -e.
  display_adapters=$(lspci 2>/dev/null | grep -iE "VGA|3D|Display" || true)
  # -w matches AMD/ATI as whole words only. A plain case-insensitive
  # substring match also hits "compATIble" and "CorporATIon", which would
  # classify every "VGA compatible controller" as an AMD GPU.
  amd_adapters=$(grep -iwE "AMD|ATI" <<<"$display_adapters" || true)
  if [[ -n "$amd_adapters" ]]; then
    echo -e "${GREEN}✓ AMD GPU detected${NC}"
    GPU_MODEL=${amd_adapters%%$'\n'*} # first matching adapter only
    echo " $GPU_MODEL"
    # Check if ROCm is available
    if command -v rocm-smi &> /dev/null; then
      ROCM_VERSION=$(rocm-smi --version 2>/dev/null | grep "ROCm" | awk '{print $3}' || true)
      # The pipeline's status is awk's, so an '|| echo' fallback would never
      # fire; default explicitly when nothing was parsed.
      ROCM_VERSION=${ROCM_VERSION:-unknown}
      echo " ROCm installed: $ROCM_VERSION"
    else
      echo -e "${YELLOW} ⚠ ROCm not detected - will install ROCm PyTorch anyway${NC}"
    fi
    GPU_TYPE="amd"
    return
  fi

  # No GPU detected
  echo -e "${YELLOW}⚠ No GPU detected - will use CPU-only build${NC}"
  GPU_TYPE="cpu"
}
# Check if PyTorch is already installed.
#
# When `python` can import torch, shows its version and GPU availability and
# asks whether to reinstall. Answering anything other than y/Y (including no
# answer at all) keeps the existing installation and EXITS THE SCRIPT with
# status 0; answering yes uninstalls torch and returns so the caller can
# install a fresh build. Returns immediately when torch is not importable.
# Globals written: TORCH_VERSION, GPU_AVAILABLE
check_existing_pytorch() {
  # Nothing to do when torch is not importable.
  if ! python -c "import torch" 2>/dev/null; then
    return 0
  fi

  TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
  GPU_AVAILABLE=$(python -c "import torch; print(torch.cuda.is_available())")
  echo ""
  echo "PyTorch is already installed:"
  echo " Version: $TORCH_VERSION"
  echo " GPU available: $GPU_AVAILABLE"
  echo ""

  # '|| true': read returns non-zero on EOF (e.g. stdin is /dev/null), which
  # under 'set -e' would abort the script with a failure status. An empty
  # reply falls through to the default "No" below instead.
  read -p "Reinstall PyTorch? (y/N): " -n 1 -r || true
  echo
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Keeping existing PyTorch installation"
    exit 0
  fi

  echo "Uninstalling existing PyTorch..."
  # Best effort: a failed uninstall must not block the reinstall.
  pip uninstall -y torch 2>/dev/null || true
}
# Install PyTorch based on hardware.
#
# Globals read:
#   GPU_TYPE      "nvidia", "amd" or "cpu" (set by detect_hardware)
#   CUDA_VERSION  major.minor CUDA version (nvidia only)
#   RED, YELLOW, NC  colour escapes used with `echo -e`
# Globals written:
#   CUDA_BUILD    wheel tag chosen for nvidia (cu118 / cu121 / cu124)
# Returns: pip's status, or 1 when GPU_TYPE is not one of the known values.
install_pytorch() {
  local index_base="https://download.pytorch.org/whl"

  echo ""
  echo "Installing PyTorch for $GPU_TYPE..."
  echo ""

  case "$GPU_TYPE" in
    nvidia)
      # Determine CUDA version to use
      case "$CUDA_VERSION" in
        12.[123]) CUDA_BUILD="cu121" ;;
        12.[456]) CUDA_BUILD="cu124" ;;
        11.*) CUDA_BUILD="cu118" ;;
        *)
          echo -e "${YELLOW}⚠ Unknown CUDA version, using CUDA 12.1 build${NC}"
          CUDA_BUILD="cu121"
          ;;
      esac
      echo "Installing PyTorch with CUDA $CUDA_BUILD support..."
      pip install torch --index-url "${index_base}/${CUDA_BUILD}"
      ;;
    amd)
      echo "Installing PyTorch with ROCm 6.2 support..."
      echo "(This works with RDNA 2, RDNA 3, and newer AMD GPUs)"
      pip install torch --index-url "${index_base}/rocm6.2"
      ;;
    cpu)
      echo "Installing CPU-only PyTorch..."
      pip install torch --index-url "${index_base}/cpu"
      ;;
    *)
      # Without this arm an unset or unexpected GPU_TYPE installed nothing
      # and still reported success to the caller.
      echo -e "${RED}✗ Unknown GPU type '${GPU_TYPE}' - nothing installed${NC}" >&2
      return 1
      ;;
  esac
}
# Verify installation.
#
# Confirms that `python` can import torch (exits the script with status 1 if
# it cannot), reports the version and GPU availability, prints device details
# when a GPU is usable, and finishes with a "next steps" banner.
# Globals read:    GPU_TYPE, RED, GREEN, YELLOW, NC
# Globals written: TORCH_VERSION, GPU_AVAILABLE and, when a GPU is usable,
#                  DEVICE_NAME, DEVICE_COUNT, MEMORY_GB
verify_installation() {
  printf '\n%s\n\n' "Verifying installation..."

  # Guard: without an importable torch there is nothing left to report.
  python -c "import torch" 2>/dev/null || {
    echo -e "${RED}✗ PyTorch installation failed!${NC}"
    exit 1
  }

  TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
  GPU_AVAILABLE=$(python -c "import torch; print(torch.cuda.is_available())")
  echo -e "${GREEN}✓ PyTorch installed successfully!${NC}"
  echo " Version: $TORCH_VERSION"
  echo " GPU available: $GPU_AVAILABLE"

  if [[ "$GPU_AVAILABLE" != "True" ]]; then
    # torch reports no usable GPU: either expected (cpu) or a setup problem.
    if [[ "$GPU_TYPE" == "cpu" ]]; then
      echo " CPU-only mode (slower training)"
    else
      echo -e "${YELLOW}⚠ GPU detected but not available in PyTorch${NC}"
      echo " This might mean:"
      echo " - GPU drivers need to be installed/updated"
      echo " - Wrong PyTorch build was installed"
      echo " - GPU is not supported"
    fi
  else
    # Query details of device 0 plus the total device count.
    DEVICE_NAME=$(python -c "import torch; print(torch.cuda.get_device_name(0))")
    DEVICE_COUNT=$(python -c "import torch; print(torch.cuda.device_count())")
    MEMORY_GB=$(python -c "import torch; print(f'{torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}')")
    echo -e "${GREEN} Device: $DEVICE_NAME${NC}"
    echo " Count: $DEVICE_COUNT"
    echo " Memory: ${MEMORY_GB} GB"
    if [[ "$GPU_TYPE" == "nvidia" ]]; then
      echo ""
      echo "🚀 Training will be 5-10x faster with NVIDIA GPU!"
    elif [[ "$GPU_TYPE" == "amd" ]]; then
      echo ""
      echo "🚀 Training will be 2-3x faster with AMD GPU!"
    fi
  fi

  # Closing banner with follow-up commands (printed verbatim).
  cat <<'EOF'

==================================================
✓ Setup complete!
==================================================

Test your setup:
 python -c "import torch; print(f'GPU: {torch.cuda.is_available()}')"

Start ANNOTATE:
 python ANNOTATE/web/app.py

EOF
}
# Main execution.
#
# Warns when no virtual environment is active (VIRTUAL_ENV empty) and exits
# with status 1 unless the user confirms with y/Y; then runs the setup steps
# in order: detect_hardware, check_existing_pytorch, install_pytorch,
# verify_installation.
main() {
  local step

  # Check if we're in a virtual environment
  if [[ -n "$VIRTUAL_ENV" ]]; then
    echo -e "${GREEN}✓ Virtual environment active: $VIRTUAL_ENV${NC}"
    echo ""
  else
    echo -e "${YELLOW}⚠ Not in a virtual environment${NC}"
    cat <<'EOF'

It's recommended to use a virtual environment:
 python -m venv venv
 source venv/bin/activate # Linux/Mac
 .\venv\Scripts\activate # Windows

EOF
    read -p "Continue anyway? (y/N): " -n 1 -r
    echo
    # Anything other than a single y/Y aborts.
    [[ $REPLY =~ ^[Yy]$ ]] || exit 1
  fi

  for step in detect_hardware check_existing_pytorch install_pytorch verify_installation; do
    "$step"
  done
}
# Run main function, forwarding any command-line arguments so main can
# consume them without this call site having to change.
main "$@"