#!/bin/bash
# Ollama-based Hugging Face Model Runner
# Alternative solution with excellent parallelism and HF integration
#
# Note: Ollama imports Hugging Face models via the hf.co/<repo> prefix,
# which works for repositories that publish GGUF-format weights.

set -e

echo "=== Ollama Hugging Face Model Runner Setup ==="
echo "High-performance alternative with excellent parallelism"
echo ""

# Install Ollama
if ! command -v ollama &> /dev/null; then
    echo "Installing Ollama..."
    curl -fsSL https://ollama.com/install.sh | sh
    echo "Ollama installed successfully!"
else
    echo "Ollama is already installed."
    ollama --version
fi

# Start the Ollama service in the background
echo "Starting Ollama service..."
ollama serve &
OLLAMA_PID=$!   # kept in case you want to stop the background server later

# Wait until the API answers instead of sleeping blindly (up to ~30s)
echo "Waiting for Ollama to start..."
for _ in $(seq 1 30); do
    if curl -sf http://localhost:11434/api/tags > /dev/null; then
        break
    fi
    sleep 1
done

# Create model management script
cat > manage_hf_models.sh << 'EOF'
#!/bin/bash
# Hugging Face Model Manager for Ollama
# Downloads and manages HF models (GGUF repositories) with Ollama

MODEL_NAME=""
REPO_ID=""

show_help() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  -r, --repo REPO_ID     Hugging Face repository ID (e.g., TheBloke/Llama-2-7B-Chat-GGUF)"
    echo "  -n, --name MODEL_NAME  Local model name for Ollama"
    echo "  -l, --list             List available models"
    echo "  -h, --help             Show this help"
    echo ""
    echo "Examples:"
    echo "  $0 -r TheBloke/Llama-2-7B-Chat-GGUF -n llama2-chat"
    echo "  $0 -r TheBloke/Mistral-7B-Instruct-v0.2-GGUF -n mistral-instruct"
    echo "  $0 -l"
}

list_models() {
    echo "=== Available Ollama Models ==="
    ollama list
    echo ""
    echo "=== Popular GGUF Hugging Face Repositories Compatible with Ollama ==="
    echo "- TheBloke/Llama-2-7B-Chat-GGUF"
    echo "- TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
    echo "- TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
    echo "- microsoft/Phi-3-mini-4k-instruct-gguf"
    echo "- bartowski/Meta-Llama-3-8B-Instruct-GGUF"
}

download_model() {
    if [[ -z "$REPO_ID" || -z "$MODEL_NAME" ]]; then
        echo "Error: Both repository ID and model name are required"
        show_help
        exit 1
    fi

    echo "=== Downloading Hugging Face Model ==="
    echo "Repository: $REPO_ID"
    echo "Local name: $MODEL_NAME"
    echo ""

    # Create a Modelfile for the HF model. The hf.co/ prefix tells Ollama
    # to pull GGUF weights directly from Hugging Face; append :<quant>
    # (e.g., :Q4_K_M) to select a specific quantization.
    cat > Modelfile << MODELFILE
FROM hf.co/$REPO_ID

# Sampling parameters
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.1
PARAMETER num_ctx 4096

# CPU threads (adjust to your core count); GPU layer offload is
# handled automatically by Ollama when a GPU is available
PARAMETER num_thread 8
MODELFILE

    echo "Created Modelfile for $MODEL_NAME"
    echo "Pulling model from Hugging Face..."

    # Build the local model (downloads the weights on first run)
    ollama create "$MODEL_NAME" -f Modelfile

    echo "Model $MODEL_NAME created successfully!"
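    # Optional sanity check: `ollama show` (standard Ollama CLI) prints
    # the created model's parameters, template, and details
    ollama show "$MODEL_NAME"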
echo "" echo "You can now run: ollama run $MODEL_NAME" } # Parse command line arguments while [[ $# -gt 0 ]]; do case $1 in -r|--repo) REPO_ID="$2" shift 2 ;; -n|--name) MODEL_NAME="$2" shift 2 ;; -l|--list) list_models exit 0 ;; -h|--help) show_help exit 0 ;; *) echo "Unknown option: $1" show_help exit 1 ;; esac done # If no arguments provided, show help if [[ $# -eq 0 ]]; then show_help exit 0 fi # Download model if both parameters provided if [[ -n "$REPO_ID" && -n "$MODEL_NAME" ]]; then download_model fi EOF chmod +x manage_hf_models.sh # Create performance test script cat > test_performance.sh << 'EOF' #!/bin/bash # Performance test for Ollama models # Tests parallelism and throughput MODEL_NAME=${1:-"dialogpt-medium"} CONCURRENT_REQUESTS=${2:-5} TOTAL_REQUESTS=${3:-20} echo "=== Ollama Performance Test ===" echo "Model: $MODEL_NAME" echo "Concurrent requests: $CONCURRENT_REQUESTS" echo "Total requests: $TOTAL_REQUESTS" echo "" # Test function test_request() { local request_id=$1 local prompt="Test prompt $request_id: What is the meaning of life?" echo "Starting request $request_id..." start_time=$(date +%s.%N) response=$(ollama run "$MODEL_NAME" "$prompt" 2>/dev/null) end_time=$(date +%s.%N) duration=$(echo "$end_time - $start_time" | bc) echo "Request $request_id completed in ${duration}s" echo "$duration" } # Run concurrent tests echo "Starting performance test..." start_time=$(date +%s.%N) # Create array to store PIDs pids=() # Launch concurrent requests for i in $(seq 1 $TOTAL_REQUESTS); do test_request $i & pids+=($!) # Limit concurrent requests if (( i % CONCURRENT_REQUESTS == 0 )); then # Wait for current batch to complete for pid in "${pids[@]}"; do wait $pid done pids=() fi done # Wait for remaining requests for pid in "${pids[@]}"; do wait $pid done end_time=$(date +%s.%N) total_duration=$(echo "$end_time - $start_time" | bc) echo "" echo "=== Performance Test Results ===" echo "Total time: ${total_duration}s" echo "Requests per second: $(echo "scale=2; $TOTAL_REQUESTS / $total_duration" | bc)" echo "Average time per request: $(echo "scale=2; $total_duration / $TOTAL_REQUESTS" | bc)s" EOF chmod +x test_performance.sh # Create Docker integration script cat > docker_ollama.sh << 'EOF' #!/bin/bash # Docker integration for Ollama # Run Ollama in Docker with GPU support echo "=== Docker Ollama Setup ===" echo "" # Create Docker Compose for Ollama cat > docker-compose-ollama.yml << 'COMPOSE' version: '3.8' services: ollama: image: ollama/ollama:latest container_name: ollama-hf-runner ports: - "11434:11434" volumes: - ollama_data:/root/.ollama environment: - OLLAMA_HOST=0.0.0.0 deploy: resources: reservations: devices: - driver: nvidia count: 1 capabilities: [gpu] restart: unless-stopped command: serve volumes: ollama_data: COMPOSE echo "Created Docker Compose configuration" echo "" echo "To start Ollama in Docker:" echo " docker-compose -f docker-compose-ollama.yml up -d" echo "" echo "To pull a model:" echo " docker exec -it ollama-hf-runner ollama pull llama2" echo "" echo "To run a model:" echo " docker exec -it ollama-hf-runner ollama run llama2" EOF chmod +x docker_ollama.sh echo "" echo "=== Ollama Setup Complete! ===" echo "" echo "=== Available Commands ===" echo "1. Manage HF models:" echo " ./manage_hf_models.sh -r microsoft/DialoGPT-medium -n dialogpt-medium" echo "" echo "2. List available models:" echo " ./manage_hf_models.sh -l" echo "" echo "3. Test performance:" echo " ./test_performance.sh dialogpt-medium 5 20" echo "" echo "4. 
Docker integration:" echo " ./docker_ollama.sh" echo "" echo "=== Quick Start ===" echo "1. Download a model:" echo " ./manage_hf_models.sh -r microsoft/DialoGPT-medium -n dialogpt-medium" echo "" echo "2. Run the model:" echo " ollama run dialogpt-medium" echo "" echo "3. Test with API:" echo " curl http://localhost:11434/api/generate -d '{\"model\": \"dialogpt-medium\", \"prompt\": \"Hello!\"}'" echo "" echo "=== Parallelism Features ===" echo "- Multi-threading support" echo "- GPU acceleration (if available)" echo "- Concurrent request handling" echo "- Batch processing" echo "- Docker integration with GPU support"