#!/bin/bash

# Advanced Hugging Face Model Runner with Parallelism
# This script sets up a Docker-based solution that mimics Docker Model Runner functionality
# Specifically designed for HF models not available in LM Studio

set -e

echo "=== Advanced Hugging Face Model Runner Setup ==="
echo "Designed for models not available in LM Studio with parallelism support"
echo ""

# Create project directory
PROJECT_DIR="$HOME/hf-model-runner"
mkdir -p "$PROJECT_DIR"
cd "$PROJECT_DIR"

echo "Project directory: $PROJECT_DIR"

# Create Docker Compose configuration with GPU support and parallelism
cat > docker-compose.yml << 'EOF'
version: '3.8'

services:
  # Main model server with GPU support and parallelism
  llama-cpp-server:
    image: ghcr.io/ggerganov/llama.cpp:server
    container_name: hf-model-server
    ports:
      - "8080:8080"
    volumes:
      - ./models:/models
      - ./config:/config
    environment:
      - MODEL_PATH=/models
      - GPU_LAYERS=35   # Adjust based on your GPU memory
      - THREADS=8       # CPU threads for parallelism
      - BATCH_SIZE=512  # Batch size for parallel processing
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: >
      --model /models/current_model.gguf
      --host 0.0.0.0
      --port 8080
      --n-gpu-layers 35
      --threads 8
      --batch-size 512
      --parallel 4
      --cont-batching
      --ctx-size 4096
      --keep-alive 300
      --log-format json
    restart: unless-stopped

  # Alternative: vLLM server for even better parallelism
  vllm-server:
    image: vllm/vllm-openai:latest
    container_name: hf-vllm-server
    ports:
      - "8000:8000"
    volumes:
      - ./models:/models
    environment:
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: >
      --model /models/current_model
      --host 0.0.0.0
      --port 8000
      --tensor-parallel-size 1
      --gpu-memory-utilization 0.9
      --max-model-len 4096
      --trust-remote-code
    restart: unless-stopped
    profiles:
      - vllm

  # Model management service
  model-manager:
    image: python:3.11-slim
    container_name: hf-model-manager
    volumes:
      - ./models:/models
      - ./scripts:/scripts
      - ./config:/config
    working_dir: /scripts
    # python:3.11-slim ships without huggingface_hub, so install dependencies before running
    command: sh -c "pip install --quiet huggingface_hub requests && python model_manager.py"
    restart: unless-stopped
    depends_on:
      - llama-cpp-server
EOF
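
# Note: the vLLM service above is gated behind a Compose profile, so it does not
# start by default. A minimal launch sketch (assumes a Compose version with
# profile support; use "docker-compose" or "docker compose" to match your setup):
#   docker compose --profile vllm up -d vllm-server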

# Create model management script
mkdir -p scripts
cat > scripts/model_manager.py << 'EOF'
#!/usr/bin/env python3
"""
Hugging Face Model Manager
Downloads and manages HF models with GGUF format support
"""

import os
import json
import requests
import subprocess
from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class HFModelManager:
    def __init__(self, models_dir="/models"):
        self.models_dir = Path(models_dir)
        self.models_dir.mkdir(exist_ok=True)
        self.config_file = Path("/config/models.json")

    def list_available_models(self, repo_id):
        """List available GGUF models in a HF repository"""
        try:
            files = list_repo_files(repo_id)
            gguf_files = [f for f in files if f.endswith('.gguf')]
            return gguf_files
        except Exception as e:
            logger.error(f"Error listing models for {repo_id}: {e}")
            return []

    def download_model(self, repo_id, filename=None):
        """Download a GGUF model from Hugging Face"""
        try:
            if filename is None:
                # Auto-select a GGUF file from the repository
                files = self.list_available_models(repo_id)
                if not files:
                    raise ValueError(f"No GGUF files found in {repo_id}")

                # Sort filenames in reverse lexicographic order as a rough
                # heuristic (filenames do not reliably encode size)
                gguf_files = sorted(files, key=lambda x: x.lower(), reverse=True)
                filename = gguf_files[0]
                logger.info(f"Auto-selected model: {filename}")

            logger.info(f"Downloading {repo_id}/{filename}...")

            # Download the model
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=self.models_dir,
                local_dir_use_symlinks=False
            )

            # Create symlink for current model
            current_model_path = self.models_dir / "current_model.gguf"
            if current_model_path.exists():
                current_model_path.unlink()
            current_model_path.symlink_to(Path(model_path).name)

            logger.info(f"Model downloaded to: {model_path}")
            logger.info(f"Current model symlink: {current_model_path}")

            return model_path

        except Exception as e:
            logger.error(f"Error downloading model: {e}")
            raise

    def get_model_info(self, repo_id):
        """Get information about a model repository"""
        try:
            # This would typically use the HF API
            return {
                "repo_id": repo_id,
                "available_files": self.list_available_models(repo_id),
                "status": "available"
            }
        except Exception as e:
            logger.error(f"Error getting model info: {e}")
            return None


def main():
    manager = HFModelManager()

    # Example: Download a specific model
    # You can modify this to download any HF model
    repo_id = "microsoft/DialoGPT-medium"  # Example model

    print(f"Managing models in: {manager.models_dir}")
    print(f"Available models: {manager.list_available_models(repo_id)}")

    # Uncomment to download a model:
    # manager.download_model(repo_id)


if __name__ == "__main__":
    main()
EOF
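
# Quick smoke test for the manager without downloading anything; a sketch that
# assumes the Python dependencies are installed in the container, as arranged
# in docker-compose.yml above:
#   docker-compose run --rm model-manager python -c \
#     "from model_manager import HFModelManager; print(HFModelManager().get_model_info('microsoft/DialoGPT-medium'))"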

# Create configuration directory
mkdir -p config
cat > config/models.json << 'EOF'
{
  "available_models": {
    "microsoft/DialoGPT-medium": {
      "description": "Microsoft DialoGPT Medium",
      "size": "345M",
      "format": "gguf"
    },
    "microsoft/DialoGPT-large": {
      "description": "Microsoft DialoGPT Large",
      "size": "774M",
      "format": "gguf"
    }
  },
  "current_model": null,
  "settings": {
    "gpu_layers": 35,
    "threads": 8,
    "batch_size": 512,
    "context_size": 4096
  }
}
EOF
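
# The "settings" block above mirrors the flags passed to llama-cpp-server in
# docker-compose.yml. For example, a value can be read back out with jq (which
# test_api.sh below already assumes is installed):
#   jq '.settings.gpu_layers' config/models.json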

# Create model download script
cat > download_model.sh << 'EOF'
#!/bin/bash

# Download specific Hugging Face model
# Usage: ./download_model.sh <repo_id> [filename]

REPO_ID=${1:-"microsoft/DialoGPT-medium"}
FILENAME=${2:-""}

echo "=== Downloading Hugging Face Model ==="
echo "Repository: $REPO_ID"
echo "Filename: ${FILENAME:-"auto-select largest GGUF"}"
echo ""

# Install the required Python packages inside the manager container, then run
# the download there (python:3.11-slim does not ship with huggingface_hub)
docker-compose run --rm model-manager sh -c "pip install --quiet huggingface_hub requests && python -c \"
from model_manager import HFModelManager
import sys

manager = HFModelManager()
try:
    if '$FILENAME':
        manager.download_model('$REPO_ID', '$FILENAME')
    else:
        manager.download_model('$REPO_ID')
    print('Model downloaded successfully!')
except Exception as e:
    print(f'Error: {e}')
    sys.exit(1)
\""

echo ""
echo "=== Model Download Complete ==="
echo "You can now start the server with: docker-compose up"
EOF
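
# Example invocations (the explicit filename below is purely illustrative; list
# a repo's real GGUF filenames first via model_manager.list_available_models):
#   ./download_model.sh microsoft/DialoGPT-medium
#   ./download_model.sh <repo_id> <model-file>.gguf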

chmod +x download_model.sh

# Create API test script
cat > test_api.sh << 'EOF'
#!/bin/bash

# Test the model API
# Usage: ./test_api.sh [prompt]

PROMPT=${1:-"Hello, how are you?"}
API_URL="http://localhost:8080/completion"

echo "=== Testing Model API ==="
echo "Prompt: $PROMPT"
echo "API URL: $API_URL"
echo ""

# Test the API
curl -X POST "$API_URL" \
    -H "Content-Type: application/json" \
    -d "{
        \"prompt\": \"$PROMPT\",
        \"n_predict\": 100,
        \"temperature\": 0.7,
        \"top_p\": 0.9,
        \"stream\": false
    }" | jq '.'

echo ""
echo "=== API Test Complete ==="
EOF
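
# The same server also exposes an OpenAI-style chat endpoint on recent
# llama.cpp builds; a sketch (availability depends on the image version you pull):
#   curl -X POST http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "current_model", "messages": [{"role": "user", "content": "Hello"}]}' | jq '.'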

chmod +x test_api.sh

# Create startup script
cat > start_server.sh << 'EOF'
#!/bin/bash

echo "=== Starting Hugging Face Model Server ==="
echo ""

# Check if NVIDIA GPU is available
if command -v nvidia-smi &> /dev/null; then
    echo "NVIDIA GPU detected:"
    nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader,nounits
    echo ""
    echo "Starting with GPU acceleration..."
    docker-compose up llama-cpp-server
else
    echo "No NVIDIA GPU detected, starting with CPU only..."
    # Generate a CPU-only compose file: drop the NVIDIA device reservation and
    # set GPU layers to 0
    sed -e '/deploy:/,/capabilities: \[gpu\]/d' \
        -e 's/n-gpu-layers 35/n-gpu-layers 0/' \
        docker-compose.yml > docker-compose-cpu.yml
    docker-compose -f docker-compose-cpu.yml up llama-cpp-server
fi
EOF
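
# To run the server in the background instead of the foreground, either branch
# above can use Compose's detach flag, e.g.:
#   docker-compose up -d llama-cpp-server
#   docker-compose logs -f llama-cpp-server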

chmod +x start_server.sh

echo ""
echo "=== Setup Complete! ==="
echo ""
echo "Project directory: $PROJECT_DIR"
echo ""
echo "=== Next Steps ==="
echo "1. Download a model:"
echo "   ./download_model.sh microsoft/DialoGPT-medium"
echo ""
echo "2. Start the server:"
echo "   ./start_server.sh"
echo ""
echo "3. Test the API:"
echo "   ./test_api.sh 'Hello, how are you?'"
echo ""
echo "=== Available Commands ==="
echo "- Download model: ./download_model.sh <repo_id> [filename]"
echo "- Start server: ./start_server.sh"
echo "- Test API: ./test_api.sh [prompt]"
echo "- View logs: docker-compose logs -f llama-cpp-server"
echo "- Stop server: docker-compose down"
echo ""
echo "=== Parallelism Features ==="
echo "- GPU acceleration with NVIDIA support"
echo "- Multi-threading for CPU processing"
echo "- Batch processing for efficiency"
echo "- Continuous batching for multiple requests"
echo ""
echo "=== OpenAI-Compatible API ==="
echo "The llama.cpp server provides these endpoints:"
echo "- POST /completion - native text completion"
echo "- POST /v1/chat/completions - OpenAI-compatible chat completions"
echo "- GET /v1/models - list available models"