#!/bin/bash

# Ollama-based Hugging Face Model Runner
# Alternative solution with excellent parallelism and HF integration

set -e

echo "=== Ollama Hugging Face Model Runner Setup ==="
echo "High-performance alternative with excellent parallelism"
echo ""

# Install Ollama
if ! command -v ollama &> /dev/null; then
    echo "Installing Ollama..."
    curl -fsSL https://ollama.com/install.sh | sh
    echo "Ollama installed successfully!"
else
    echo "Ollama is already installed."
    ollama --version
fi

# Start Ollama service in the background; the PID is kept so the
# service can be stopped later with: kill "$OLLAMA_PID"
echo "Starting Ollama service..."
ollama serve &
OLLAMA_PID=$!

# Wait for the service to come up by polling the API instead of a fixed sleep
echo "Waiting for Ollama to start..."
for _ in $(seq 1 30); do
    if curl -fsS http://localhost:11434/api/version > /dev/null 2>&1; then
        break
    fi
    sleep 1
done

# Create model management script
cat > manage_hf_models.sh << 'EOF'
#!/bin/bash

# Hugging Face Model Manager for Ollama
# Downloads and manages HF models with Ollama

MODEL_NAME=""
REPO_ID=""

show_help() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  -r, --repo REPO_ID     Hugging Face repository ID (e.g., microsoft/DialoGPT-medium)"
    echo "  -n, --name MODEL_NAME  Local model name for Ollama"
    echo "  -l, --list             List available models"
    echo "  -h, --help             Show this help"
    echo ""
    echo "Examples:"
    echo "  $0 -r microsoft/DialoGPT-medium -n dialogpt-medium"
    echo "  $0 -r microsoft/DialoGPT-large -n dialogpt-large"
    echo "  $0 -l"
}

list_models() {
    echo "=== Available Ollama Models ==="
    ollama list
    echo ""
    echo "=== Popular Hugging Face Model Repositories ==="
    echo "(Ollama imports HF models as GGUF weights; verify the repository provides a GGUF build)"
    echo "- microsoft/DialoGPT-medium"
    echo "- microsoft/DialoGPT-large"
    echo "- microsoft/DialoGPT-small"
    echo "- facebook/blenderbot-400M-distill"
    echo "- facebook/blenderbot-1B-distill"
    echo "- facebook/blenderbot-3B"
    echo "- EleutherAI/gpt-neo-125M"
    echo "- EleutherAI/gpt-neo-1.3B"
    echo "- EleutherAI/gpt-neo-2.7B"
}

download_model() {
    if [[ -z "$REPO_ID" || -z "$MODEL_NAME" ]]; then
        echo "Error: Both repository ID and model name are required"
        show_help
        exit 1
    fi

    echo "=== Downloading Hugging Face Model ==="
    echo "Repository: $REPO_ID"
    echo "Local name: $MODEL_NAME"
    echo ""

    # Create Modelfile for the HF model.
    # NOTE: Ollama pulls Hugging Face models through the hf.co/ prefix and
    # requires the repository to publish GGUF weights.
    cat > Modelfile << MODELFILE
FROM hf.co/$REPO_ID

# Set parameters for better performance
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.1
PARAMETER num_ctx 4096

# Enable parallelism
PARAMETER num_thread 8
PARAMETER num_gpu 1
MODELFILE

echo "Created Modelfile for $MODEL_NAME"
|
|
echo "Pulling model from Hugging Face..."
|
|
|
|
# Pull the model
|
|
ollama create "$MODEL_NAME" -f Modelfile
|
|
|
|
echo "Model $MODEL_NAME created successfully!"
|
|
echo ""
|
|
echo "You can now run: ollama run $MODEL_NAME"
|
|
}
|
|
|
|
# If no arguments were provided, show help. This must be checked before
# parsing, since the loop below consumes "$@" and leaves $# at zero.
if [[ $# -eq 0 ]]; then
    show_help
    exit 0
fi

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        -r|--repo)
            REPO_ID="$2"
            shift 2
            ;;
        -n|--name)
            MODEL_NAME="$2"
            shift 2
            ;;
        -l|--list)
            list_models
            exit 0
            ;;
        -h|--help)
            show_help
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            show_help
            exit 1
            ;;
    esac
done

# Download the model if both parameters were provided
if [[ -n "$REPO_ID" && -n "$MODEL_NAME" ]]; then
    download_model
fi
EOF

chmod +x manage_hf_models.sh

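# Optional sanity check (a sketch): after creating a model, `ollama show <name>`
# prints its parameters and template, confirming the Modelfile was applied, e.g.:
#   ollama show dialogpt-medium
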
# Create performance test script
cat > test_performance.sh << 'EOF'
#!/bin/bash

# Performance test for Ollama models
# Tests parallelism and throughput (the timing math below requires `bc`)

MODEL_NAME=${1:-"dialogpt-medium"}
CONCURRENT_REQUESTS=${2:-5}
TOTAL_REQUESTS=${3:-20}

echo "=== Ollama Performance Test ==="
|
|
echo "Model: $MODEL_NAME"
|
|
echo "Concurrent requests: $CONCURRENT_REQUESTS"
|
|
echo "Total requests: $TOTAL_REQUESTS"
|
|
echo ""
|
|
|
|
# Test function: time a single request
test_request() {
    local request_id=$1
    local prompt="Test prompt $request_id: What is the meaning of life?"

    echo "Starting request $request_id..."
    start_time=$(date +%s.%N)

    # Output is discarded; only latency is measured here
    ollama run "$MODEL_NAME" "$prompt" > /dev/null 2>&1

    end_time=$(date +%s.%N)
    duration=$(echo "$end_time - $start_time" | bc)

    echo "Request $request_id completed in ${duration}s"
}

# Run concurrent tests
echo "Starting performance test..."
start_time=$(date +%s.%N)

# Array to store background PIDs
pids=()

# Launch requests, at most CONCURRENT_REQUESTS at a time
for i in $(seq 1 "$TOTAL_REQUESTS"); do
    test_request "$i" &
    pids+=($!)

    # When a full batch is in flight, wait for it to complete
    if (( i % CONCURRENT_REQUESTS == 0 )); then
        for pid in "${pids[@]}"; do
            wait "$pid"
        done
        pids=()
    fi
done

# Wait for remaining requests
for pid in "${pids[@]}"; do
    wait "$pid"
done

end_time=$(date +%s.%N)
total_duration=$(echo "$end_time - $start_time" | bc)

echo ""
echo "=== Performance Test Results ==="
echo "Total time: ${total_duration}s"
echo "Requests per second: $(echo "scale=2; $TOTAL_REQUESTS / $total_duration" | bc)"
echo "Average time per request: $(echo "scale=2; $total_duration / $TOTAL_REQUESTS" | bc)s"
EOF

chmod +x test_performance.sh

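# Optional HTTP-level variant (a sketch; the script name and defaults are
# illustrative): hitting the documented /api/generate endpoint directly
# exercises the server's concurrent request handling without spawning one
# `ollama run` CLI process per request. Assumes curl and the default port 11434.
cat > test_api_concurrency.sh << 'EOF'
#!/bin/bash

# Fire N concurrent non-streaming generate requests at the Ollama API
MODEL_NAME=${1:-"dialogpt-medium"}
CONCURRENT_REQUESTS=${2:-5}

for i in $(seq 1 "$CONCURRENT_REQUESTS"); do
    curl -s http://localhost:11434/api/generate \
        -d "{\"model\": \"$MODEL_NAME\", \"prompt\": \"Test prompt $i\", \"stream\": false}" \
        > "response_$i.json" &
done
wait
echo "All $CONCURRENT_REQUESTS API requests completed (see response_*.json)"
EOF

chmod +x test_api_concurrency.sh
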
# Create Docker integration script
cat > docker_ollama.sh << 'EOF'
#!/bin/bash

# Docker integration for Ollama
# Run Ollama in Docker with GPU support

echo "=== Docker Ollama Setup ==="
echo ""

# Create Docker Compose file for Ollama
cat > docker-compose-ollama.yml << 'COMPOSE'
version: '3.8'

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama-hf-runner
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    environment:
      - OLLAMA_HOST=0.0.0.0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    command: serve

volumes:
  ollama_data:
COMPOSE

echo "Created Docker Compose configuration"
|
|
echo ""
|
|
echo "To start Ollama in Docker:"
|
|
echo " docker-compose -f docker-compose-ollama.yml up -d"
|
|
echo ""
|
|
echo "To pull a model:"
|
|
echo " docker exec -it ollama-hf-runner ollama pull llama2"
|
|
echo ""
|
|
echo "To run a model:"
|
|
echo " docker exec -it ollama-hf-runner ollama run llama2"
|
|
EOF

chmod +x docker_ollama.sh

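# GPU note (assumption: NVIDIA host): the GPU reservation in the compose file
# only takes effect if the NVIDIA Container Toolkit is installed on the host.
# A quick way to verify Docker can see the GPU (image tag is illustrative):
#   docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
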
echo ""
|
|
echo "=== Ollama Setup Complete! ==="
|
|
echo ""
|
|
echo "=== Available Commands ==="
|
|
echo "1. Manage HF models:"
|
|
echo " ./manage_hf_models.sh -r microsoft/DialoGPT-medium -n dialogpt-medium"
|
|
echo ""
|
|
echo "2. List available models:"
|
|
echo " ./manage_hf_models.sh -l"
|
|
echo ""
|
|
echo "3. Test performance:"
|
|
echo " ./test_performance.sh dialogpt-medium 5 20"
|
|
echo ""
|
|
echo "4. Docker integration:"
|
|
echo " ./docker_ollama.sh"
|
|
echo ""
|
|
echo "=== Quick Start ==="
|
|
echo "1. Download a model:"
|
|
echo " ./manage_hf_models.sh -r microsoft/DialoGPT-medium -n dialogpt-medium"
|
|
echo ""
|
|
echo "2. Run the model:"
|
|
echo " ollama run dialogpt-medium"
|
|
echo ""
|
|
echo "3. Test with API:"
|
|
echo " curl http://localhost:11434/api/generate -d '{\"model\": \"dialogpt-medium\", \"prompt\": \"Hello!\"}'"
echo ""
echo "=== Parallelism Features ==="
echo "- Multi-threading support"
echo "- GPU acceleration (if available)"
echo "- Concurrent request handling"
echo "- Batch processing"
echo "- Docker integration with GPU support"
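
# Tuning note (a sketch; the variable names are from the Ollama FAQ, defaults
# vary by version): server-side parallelism is configured with environment
# variables at serve time, e.g.:
#   OLLAMA_NUM_PARALLEL=4 OLLAMA_MAX_LOADED_MODELS=2 ollama serve
echo ""
echo "=== Tuning Tip ==="
echo "Server-side parallelism can be tuned when starting the server, e.g.:"
echo "  OLLAMA_NUM_PARALLEL=4 OLLAMA_MAX_LOADED_MODELS=2 ollama serve"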