Files
gogo2/setup_ollama_hf_runner.sh
2025-09-25 00:52:01 +03:00

309 lines
7.3 KiB
Bash

#!/bin/bash
# Ollama-based Hugging Face Model Runner
# Alternative solution with excellent parallelism and HF integration

# Abort on the first failing command.
set -e

# Banner: printf emits one line per argument; the trailing "" produces the
# blank line the original echo chain printed.
printf '%s\n' \
  "=== Ollama Hugging Face Model Runner Setup ===" \
  "High-performance alternative with excellent parallelism" \
  ""
# Install Ollama if it is not already on PATH; otherwise report the version.
if command -v ollama > /dev/null 2>&1; then
  echo "Ollama is already installed."
  ollama --version
else
  echo "Installing Ollama..."
  # Official installer; under `set -e` a failing sh stage aborts the script.
  curl -fsSL https://ollama.com/install.sh | sh
  echo "Ollama installed successfully!"
fi
# Start the Ollama service unless an instance is already answering on the
# default API port (11434, same port the curl example below uses). Readiness
# is confirmed by polling the HTTP API instead of a blind `sleep 5`; the old
# version captured OLLAMA_PID but never verified the service came up.
echo "Starting Ollama service..."
if curl -fsS http://localhost:11434/api/version > /dev/null 2>&1; then
  echo "Ollama service is already running."
  OLLAMA_PID=""
else
  ollama serve &
  OLLAMA_PID=$!
  echo "Waiting for Ollama to start..."
  # Poll up to 30s, 1s apart; break as soon as the API answers.
  for _ in {1..30}; do
    if curl -fsS http://localhost:11434/api/version > /dev/null 2>&1; then
      break
    fi
    sleep 1
  done
  if ! curl -fsS http://localhost:11434/api/version > /dev/null 2>&1; then
    echo "Warning: Ollama did not become ready within 30s (pid $OLLAMA_PID)" >&2
  fi
fi
# Create the model management helper script.
# Bug fixed vs. the previous version: the "no arguments" help check ran AFTER
# the option loop had consumed "$@", so it ALWAYS fired and exited 0 — the
# download_model call at the bottom was unreachable. The check now runs first.
# Also fixed: `shift 2` on a missing option argument used to loop forever.
cat > manage_hf_models.sh << 'EOF'
#!/bin/bash
# Hugging Face Model Manager for Ollama
# Downloads and manages HF models with Ollama.
set -euo pipefail

MODEL_NAME=""
REPO_ID=""

# Print usage information.
show_help() {
  echo "Usage: $0 [OPTIONS]"
  echo ""
  echo "Options:"
  echo " -r, --repo REPO_ID Hugging Face repository ID (e.g., microsoft/DialoGPT-medium)"
  echo " -n, --name MODEL_NAME Local model name for Ollama"
  echo " -l, --list List available models"
  echo " -h, --help Show this help"
  echo ""
  echo "Examples:"
  echo " $0 -r microsoft/DialoGPT-medium -n dialogpt-medium"
  echo " $0 -r microsoft/DialoGPT-large -n dialogpt-large"
  echo " $0 -l"
}

# List locally installed Ollama models plus suggested HF repositories.
list_models() {
  echo "=== Available Ollama Models ==="
  ollama list
  echo ""
  echo "=== Popular Hugging Face Models Compatible with Ollama ==="
  echo "- microsoft/DialoGPT-medium"
  echo "- microsoft/DialoGPT-large"
  echo "- microsoft/DialoGPT-small"
  echo "- facebook/blenderbot-400M-distill"
  echo "- facebook/blenderbot-1B-distill"
  echo "- facebook/blenderbot-3B"
  echo "- EleutherAI/gpt-neo-125M"
  echo "- EleutherAI/gpt-neo-1.3B"
  echo "- EleutherAI/gpt-neo-2.7B"
}

# Write a Modelfile for $REPO_ID and register it with Ollama as $MODEL_NAME.
download_model() {
  if [[ -z "$REPO_ID" || -z "$MODEL_NAME" ]]; then
    echo "Error: Both repository ID and model name are required"
    show_help
    exit 1
  fi
  echo "=== Downloading Hugging Face Model ==="
  echo "Repository: $REPO_ID"
  echo "Local name: $MODEL_NAME"
  echo ""
  # Create Modelfile for the HF model.
  # NOTE(review): recent Ollama resolves Hugging Face GGUF repos via the
  # "hf.co/<repo>" form; a bare repo id may not pull — verify against the
  # installed Ollama version.
  cat > Modelfile << MODELFILE
FROM $REPO_ID
# Set parameters for better performance
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.1
PARAMETER num_ctx 4096
# Enable parallelism
PARAMETER num_thread 8
PARAMETER num_gpu 1
MODELFILE
  echo "Created Modelfile for $MODEL_NAME"
  echo "Pulling model from Hugging Face..."
  # Pull the model
  ollama create "$MODEL_NAME" -f Modelfile
  echo "Model $MODEL_NAME created successfully!"
  echo ""
  echo "You can now run: ollama run $MODEL_NAME"
}

# Show help when invoked without arguments — this must happen BEFORE the
# option loop below, which consumes "$@".
if [[ $# -eq 0 ]]; then
  show_help
  exit 0
fi

# Parse command line arguments.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -r|--repo)
      [[ $# -ge 2 ]] || { echo "Error: $1 requires an argument"; exit 1; }
      REPO_ID="$2"
      shift 2
      ;;
    -n|--name)
      [[ $# -ge 2 ]] || { echo "Error: $1 requires an argument"; exit 1; }
      MODEL_NAME="$2"
      shift 2
      ;;
    -l|--list)
      list_models
      exit 0
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      show_help
      exit 1
      ;;
  esac
done

# Download the model when both parameters were provided.
if [[ -n "$REPO_ID" && -n "$MODEL_NAME" ]]; then
  download_model
fi
EOF
chmod +x manage_hf_models.sh
# Create the performance test script.
# Fixes vs. the previous version: float arithmetic uses awk (bc is frequently
# not installed), all expansions are quoted, and the stray bare
# `echo "$duration"` (an unconsumed pseudo-return value that polluted the
# output) plus the unused `response=` capture are gone.
cat > test_performance.sh << 'EOF'
#!/bin/bash
# Performance test for Ollama models — measures parallelism and throughput.
# Usage: ./test_performance.sh [model] [concurrent_requests] [total_requests]
MODEL_NAME=${1:-"dialogpt-medium"}
CONCURRENT_REQUESTS=${2:-5}
TOTAL_REQUESTS=${3:-20}

echo "=== Ollama Performance Test ==="
echo "Model: $MODEL_NAME"
echo "Concurrent requests: $CONCURRENT_REQUESTS"
echo "Total requests: $TOTAL_REQUESTS"
echo ""

# Run a single request against the model and report its wall-clock duration.
test_request() {
  local request_id=$1
  local prompt="Test prompt $request_id: What is the meaning of life?"
  local start_time end_time duration
  echo "Starting request $request_id..."
  start_time=$(date +%s.%N)
  ollama run "$MODEL_NAME" "$prompt" > /dev/null 2>&1
  end_time=$(date +%s.%N)
  # awk handles the fractional-seconds subtraction portably.
  duration=$(awk -v a="$start_time" -v b="$end_time" 'BEGIN { printf "%.3f", b - a }')
  echo "Request $request_id completed in ${duration}s"
}

echo "Starting performance test..."
overall_start=$(date +%s.%N)

# Launch requests in batches of CONCURRENT_REQUESTS, waiting between batches.
pids=()
for (( i = 1; i <= TOTAL_REQUESTS; i++ )); do
  test_request "$i" &
  pids+=("$!")
  if (( i % CONCURRENT_REQUESTS == 0 )); then
    wait "${pids[@]}"
    pids=()
  fi
done
# Reap any requests left over from a partial final batch.
if (( ${#pids[@]} > 0 )); then
  wait "${pids[@]}"
fi

overall_end=$(date +%s.%N)
total_duration=$(awk -v a="$overall_start" -v b="$overall_end" 'BEGIN { printf "%.3f", b - a }')

echo ""
echo "=== Performance Test Results ==="
echo "Total time: ${total_duration}s"
echo "Requests per second: $(awk -v n="$TOTAL_REQUESTS" -v t="$total_duration" 'BEGIN { printf "%.2f", n / t }')"
echo "Average time per request: $(awk -v n="$TOTAL_REQUESTS" -v t="$total_duration" 'BEGIN { printf "%.2f", t / n }')s"
EOF
chmod +x test_performance.sh
# Create the Docker integration script.
# The generated compose file's YAML nesting is written out explicitly (the
# previous flat layout was not valid YAML), and the obsolete top-level
# "version" key is dropped — the Compose Specification deprecates it and
# current `docker compose` releases warn when it is present.
cat > docker_ollama.sh << 'EOF'
#!/bin/bash
# Docker integration for Ollama
# Run Ollama in Docker with GPU support
echo "=== Docker Ollama Setup ==="
echo ""
# Create Docker Compose for Ollama
cat > docker-compose-ollama.yml << 'COMPOSE'
services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama-hf-runner
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    environment:
      - OLLAMA_HOST=0.0.0.0
    # GPU reservation requires the NVIDIA container toolkit on the host.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    command: serve
volumes:
  ollama_data:
COMPOSE
echo "Created Docker Compose configuration"
echo ""
echo "To start Ollama in Docker:"
echo " docker-compose -f docker-compose-ollama.yml up -d"
echo ""
echo "To pull a model:"
echo " docker exec -it ollama-hf-runner ollama pull llama2"
echo ""
echo "To run a model:"
echo " docker exec -it ollama-hf-runner ollama run llama2"
EOF
chmod +x docker_ollama.sh
# Print the usage summary. A single quoted here-doc replaces the long chain
# of echo statements; the emitted text is unchanged.
cat << 'SUMMARY'

=== Ollama Setup Complete! ===

=== Available Commands ===
1. Manage HF models:
 ./manage_hf_models.sh -r microsoft/DialoGPT-medium -n dialogpt-medium

2. List available models:
 ./manage_hf_models.sh -l

3. Test performance:
 ./test_performance.sh dialogpt-medium 5 20

4. Docker integration:
 ./docker_ollama.sh

=== Quick Start ===
1. Download a model:
 ./manage_hf_models.sh -r microsoft/DialoGPT-medium -n dialogpt-medium

2. Run the model:
 ollama run dialogpt-medium

3. Test with API:
 curl http://localhost:11434/api/generate -d '{"model": "dialogpt-medium", "prompt": "Hello!"}'

=== Parallelism Features ===
- Multi-threading support
- GPU acceleration (if available)
- Concurrent request handling
- Batch processing
- Docker integration with GPU support
SUMMARY