#!/bin/bash

# Ollama-based Hugging Face Model Runner
# Alternative solution with excellent parallelism and HF integration

set -e

echo "=== Ollama Hugging Face Model Runner Setup ==="
echo "High-performance alternative with excellent parallelism"
echo ""

# Install Ollama
if ! command -v ollama &> /dev/null; then
    echo "Installing Ollama..."
    curl -fsSL https://ollama.com/install.sh | sh
    echo "Ollama installed successfully!"
else
    echo "Ollama is already installed."
    ollama --version
fi

# Start Ollama service in the background; the PID is kept so the
# service can be stopped later with: kill "$OLLAMA_PID"
echo "Starting Ollama service..."
ollama serve &
OLLAMA_PID=$!

# Wait for the service to come up by polling the API instead of a fixed sleep
echo "Waiting for Ollama to start..."
for _ in $(seq 1 30); do
    if curl -fsS http://localhost:11434/api/version > /dev/null 2>&1; then
        break
    fi
    sleep 1
done

# Create model management script
cat > manage_hf_models.sh << 'EOF'
#!/bin/bash

# Hugging Face Model Manager for Ollama
# Downloads and manages HF models with Ollama

MODEL_NAME=""
REPO_ID=""

show_help() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  -r, --repo REPO_ID     Hugging Face repository ID (e.g., microsoft/DialoGPT-medium)"
    echo "  -n, --name MODEL_NAME  Local model name for Ollama"
    echo "  -l, --list             List available models"
    echo "  -h, --help             Show this help"
    echo ""
    echo "Examples:"
    echo "  $0 -r microsoft/DialoGPT-medium -n dialogpt-medium"
    echo "  $0 -r microsoft/DialoGPT-large -n dialogpt-large"
    echo "  $0 -l"
}

list_models() {
    echo "=== Available Ollama Models ==="
    ollama list
    echo ""
    echo "=== Popular Hugging Face Model Repositories ==="
    echo "(Ollama imports HF models as GGUF weights; verify the repository provides a GGUF build)"
    echo "- microsoft/DialoGPT-medium"
    echo "- microsoft/DialoGPT-large"
    echo "- microsoft/DialoGPT-small"
    echo "- facebook/blenderbot-400M-distill"
    echo "- facebook/blenderbot-1B-distill"
    echo "- facebook/blenderbot-3B"
    echo "- EleutherAI/gpt-neo-125M"
    echo "- EleutherAI/gpt-neo-1.3B"
    echo "- EleutherAI/gpt-neo-2.7B"
}

download_model() {
    if [[ -z "$REPO_ID" || -z "$MODEL_NAME" ]]; then
        echo "Error: Both repository ID and model name are required"
        show_help
        exit 1
    fi

    echo "=== Downloading Hugging Face Model ==="
    echo "Repository: $REPO_ID"
    echo "Local name: $MODEL_NAME"
    echo ""

    # Create Modelfile for the HF model.
    # NOTE: Ollama pulls Hugging Face models through the hf.co/ prefix and
    # requires the repository to publish GGUF weights.
    cat > Modelfile << MODELFILE
FROM hf.co/$REPO_ID

# Set parameters for better performance
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.1
PARAMETER num_ctx 4096

# Enable parallelism
PARAMETER num_thread 8
PARAMETER num_gpu 1
MODELFILE

echo "Created Modelfile for $MODEL_NAME"
|
|
echo "Pulling model from Hugging Face..."
|
|
|
|
# Pull the model
|
|
ollama create "$MODEL_NAME" -f Modelfile
|
|
|
|
echo "Model $MODEL_NAME created successfully!"
|
|
echo ""
|
|
echo "You can now run: ollama run $MODEL_NAME"
|
|
}
|
|
|
|
# If no arguments were provided, show help. This must be checked before
# parsing, since the loop below consumes "$@" and leaves $# at zero.
if [[ $# -eq 0 ]]; then
    show_help
    exit 0
fi

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        -r|--repo)
            REPO_ID="$2"
            shift 2
            ;;
        -n|--name)
            MODEL_NAME="$2"
            shift 2
            ;;
        -l|--list)
            list_models
            exit 0
            ;;
        -h|--help)
            show_help
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            show_help
            exit 1
            ;;
    esac
done

# Download the model if both parameters were provided
if [[ -n "$REPO_ID" && -n "$MODEL_NAME" ]]; then
    download_model
fi
EOF

chmod +x manage_hf_models.sh

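# Optional sanity check (a sketch): after creating a model, `ollama show <name>`
# prints its parameters and template, confirming the Modelfile was applied, e.g.:
#   ollama show dialogpt-medium
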
# Create performance test script
cat > test_performance.sh << 'EOF'
#!/bin/bash

# Performance test for Ollama models
# Tests parallelism and throughput (the timing math below requires `bc`)

MODEL_NAME=${1:-"dialogpt-medium"}
CONCURRENT_REQUESTS=${2:-5}
TOTAL_REQUESTS=${3:-20}

echo "=== Ollama Performance Test ==="
|
|
echo "Model: $MODEL_NAME"
|
|
echo "Concurrent requests: $CONCURRENT_REQUESTS"
|
|
echo "Total requests: $TOTAL_REQUESTS"
|
|
echo ""
|
|
|
|
# Test function: time a single request
test_request() {
    local request_id=$1
    local prompt="Test prompt $request_id: What is the meaning of life?"

    echo "Starting request $request_id..."
    start_time=$(date +%s.%N)

    # Output is discarded; only latency is measured here
    ollama run "$MODEL_NAME" "$prompt" > /dev/null 2>&1

    end_time=$(date +%s.%N)
    duration=$(echo "$end_time - $start_time" | bc)

    echo "Request $request_id completed in ${duration}s"
}

# Run concurrent tests
echo "Starting performance test..."
start_time=$(date +%s.%N)

# Array to store background PIDs
pids=()

# Launch requests, at most CONCURRENT_REQUESTS at a time
for i in $(seq 1 "$TOTAL_REQUESTS"); do
    test_request "$i" &
    pids+=($!)

    # When a full batch is in flight, wait for it to complete
    if (( i % CONCURRENT_REQUESTS == 0 )); then
        for pid in "${pids[@]}"; do
            wait "$pid"
        done
        pids=()
    fi
done

# Wait for remaining requests
for pid in "${pids[@]}"; do
    wait "$pid"
done

end_time=$(date +%s.%N)
total_duration=$(echo "$end_time - $start_time" | bc)

echo ""
echo "=== Performance Test Results ==="
echo "Total time: ${total_duration}s"
echo "Requests per second: $(echo "scale=2; $TOTAL_REQUESTS / $total_duration" | bc)"
echo "Average time per request: $(echo "scale=2; $total_duration / $TOTAL_REQUESTS" | bc)s"
EOF

chmod +x test_performance.sh

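# Optional HTTP-level variant (a sketch; the script name and defaults are
# illustrative): hitting the documented /api/generate endpoint directly
# exercises the server's concurrent request handling without spawning one
# `ollama run` CLI process per request. Assumes curl and the default port 11434.
cat > test_api_concurrency.sh << 'EOF'
#!/bin/bash

# Fire N concurrent non-streaming generate requests at the Ollama API
MODEL_NAME=${1:-"dialogpt-medium"}
CONCURRENT_REQUESTS=${2:-5}

for i in $(seq 1 "$CONCURRENT_REQUESTS"); do
    curl -s http://localhost:11434/api/generate \
        -d "{\"model\": \"$MODEL_NAME\", \"prompt\": \"Test prompt $i\", \"stream\": false}" \
        > "response_$i.json" &
done
wait
echo "All $CONCURRENT_REQUESTS API requests completed (see response_*.json)"
EOF

chmod +x test_api_concurrency.sh
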
# Create Docker integration script
cat > docker_ollama.sh << 'EOF'
#!/bin/bash

# Docker integration for Ollama
# Run Ollama in Docker with GPU support

echo "=== Docker Ollama Setup ==="
echo ""

# Create Docker Compose file for Ollama
cat > docker-compose-ollama.yml << 'COMPOSE'
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama-hf-runner
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    environment:
      - OLLAMA_HOST=0.0.0.0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    command: serve

volumes:
  ollama_data:
COMPOSE

echo "Created Docker Compose configuration"
|
|
echo ""
|
|
echo "To start Ollama in Docker:"
|
|
echo " docker-compose -f docker-compose-ollama.yml up -d"
|
|
echo ""
|
|
echo "To pull a model:"
|
|
echo " docker exec -it ollama-hf-runner ollama pull llama2"
|
|
echo ""
|
|
echo "To run a model:"
|
|
echo " docker exec -it ollama-hf-runner ollama run llama2"
|
|
EOF

chmod +x docker_ollama.sh

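# GPU note (assumption: NVIDIA host): the GPU reservation in the compose file
# only takes effect if the NVIDIA Container Toolkit is installed on the host.
# A quick way to verify Docker can see the GPU (image tag is illustrative):
#   docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
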
echo ""
|
|
echo "=== Ollama Setup Complete! ==="
|
|
echo ""
|
|
echo "=== Available Commands ==="
|
|
echo "1. Manage HF models:"
|
|
echo " ./manage_hf_models.sh -r microsoft/DialoGPT-medium -n dialogpt-medium"
|
|
echo ""
|
|
echo "2. List available models:"
|
|
echo " ./manage_hf_models.sh -l"
|
|
echo ""
|
|
echo "3. Test performance:"
|
|
echo " ./test_performance.sh dialogpt-medium 5 20"
|
|
echo ""
|
|
echo "4. Docker integration:"
|
|
echo " ./docker_ollama.sh"
|
|
echo ""
|
|
echo "=== Quick Start ==="
|
|
echo "1. Download a model:"
|
|
echo " ./manage_hf_models.sh -r microsoft/DialoGPT-medium -n dialogpt-medium"
|
|
echo ""
|
|
echo "2. Run the model:"
|
|
echo " ollama run dialogpt-medium"
|
|
echo ""
|
|
echo "3. Test with API:"
|
|
echo " curl http://localhost:11434/api/generate -d '{\"model\": \"dialogpt-medium\", \"prompt\": \"Hello!\"}'"
echo ""
echo "=== Parallelism Features ==="
echo "- Multi-threading support"
echo "- GPU acceleration (if available)"
echo "- Concurrent request handling"
echo "- Batch processing"
echo "- Docker integration with GPU support"
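
# Tuning note (a sketch; the variable names are from the Ollama FAQ, defaults
# vary by version): server-side parallelism is configured with environment
# variables at serve time, e.g.:
#   OLLAMA_NUM_PARALLEL=4 OLLAMA_MAX_LOADED_MODELS=2 ollama serve
echo ""
echo "=== Tuning Tip ==="
echo "Server-side parallelism can be tuned when starting the server, e.g.:"
echo "  OLLAMA_NUM_PARALLEL=4 OLLAMA_MAX_LOADED_MODELS=2 ollama serve"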