#!/bin/bash

# Advanced Hugging Face Model Runner with Parallelism
# This script sets up a Docker-based solution that mimics Docker Model Runner functionality
# Specifically designed for HF models not available in LM Studio

set -e

echo "=== Advanced Hugging Face Model Runner Setup ==="
echo "Designed for models not available in LM Studio with parallelism support"
echo ""

# Create project directory
PROJECT_DIR="$HOME/hf-model-runner"
mkdir -p "$PROJECT_DIR"
cd "$PROJECT_DIR"

echo "Project directory: $PROJECT_DIR"

# Create Docker Compose configuration with GPU support and parallelism
cat > docker-compose.yml << 'EOF'
version: '3.8'

services:
  # Main model server with GPU support and parallelism
  llama-cpp-server:
    image: ghcr.io/ggerganov/llama.cpp:server
    container_name: hf-model-server
    ports:
      - "8080:8080"
    volumes:
      - ./models:/models
      - ./config:/config
    environment:
      - MODEL_PATH=/models
      - GPU_LAYERS=35   # Adjust based on your GPU memory
      - THREADS=8       # CPU threads for parallelism
      - BATCH_SIZE=512  # Batch size for parallel processing
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    # --parallel sets the number of request slots decoded concurrently; adjust as needed
    command: >
      --model /models/current_model.gguf
      --host 0.0.0.0
      --port 8080
      --n-gpu-layers 35
      --threads 8
      --batch-size 512
      --parallel 4
      --cont-batching
      --ctx-size 4096
      --keep-alive 300
      --log-format json
    restart: unless-stopped

  # Alternative: vLLM server for even better parallelism
  vllm-server:
    image: vllm/vllm-openai:latest
    container_name: hf-vllm-server
    ports:
      - "8000:8000"
    volumes:
      - ./models:/models
    environment:
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: >
      --model /models/current_model
      --host 0.0.0.0
      --port 8000
      --tensor-parallel-size 1
      --gpu-memory-utilization 0.9
      --max-model-len 4096
      --trust-remote-code
    restart: unless-stopped
    profiles:
      - vllm

  # Model management service
  model-manager:
    image: python:3.11-slim
    container_name: hf-model-manager
    volumes:
      - ./models:/models
      - ./scripts:/scripts
      - ./config:/config
    working_dir: /scripts
    # python:3.11-slim ships without huggingface_hub, so install it before starting
    command: sh -c "pip install --quiet huggingface_hub && python model_manager.py"
    restart: unless-stopped
    depends_on:
      - llama-cpp-server
EOF

# Create model management script
mkdir -p scripts
cat > scripts/model_manager.py << 'EOF'
#!/usr/bin/env python3
"""
Hugging Face Model Manager
Downloads and manages HF models with GGUF format support
"""

import os
import json
import requests
import subprocess
from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class HFModelManager:
    def __init__(self, models_dir="/models"):
        self.models_dir = Path(models_dir)
        self.models_dir.mkdir(exist_ok=True)
        self.config_file = Path("/config/models.json")

    def list_available_models(self, repo_id):
        """List available GGUF files in a HF repository"""
        try:
            files = list_repo_files(repo_id)
            gguf_files = [f for f in files if f.endswith('.gguf')]
            return gguf_files
        except Exception as e:
            logger.error(f"Error listing models for {repo_id}: {e}")
            return []

    def download_model(self, repo_id, filename=None):
        """Download a GGUF model from Hugging Face"""
        try:
            if filename is None:
                # Auto-select a GGUF file: sort filenames in reverse order as a rough
                # heuristic; pass an explicit filename to control which file is used
                files = self.list_available_models(repo_id)
                if not files:
                    raise ValueError(f"No GGUF files found in {repo_id}")
                gguf_files = sorted(files, key=lambda x: x.lower(), reverse=True)
                filename = gguf_files[0]
                logger.info(f"Auto-selected model: {filename}")

            logger.info(f"Downloading {repo_id}/{filename}...")

            # Download the model
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=self.models_dir,
                local_dir_use_symlinks=False
            )

            # Create symlink for current model
            current_model_path = self.models_dir / "current_model.gguf"
            if current_model_path.exists():
                current_model_path.unlink()
            current_model_path.symlink_to(Path(model_path).name)

            logger.info(f"Model downloaded to: {model_path}")
            logger.info(f"Current model symlink: {current_model_path}")
            return model_path

        except Exception as e:
            logger.error(f"Error downloading model: {e}")
            raise

    def get_model_info(self, repo_id):
        """Get information about a model repository"""
        try:
            # This would typically use the HF API
            return {
                "repo_id": repo_id,
                "available_files": self.list_available_models(repo_id),
                "status": "available"
            }
        except Exception as e:
            logger.error(f"Error getting model info: {e}")
            return None


def main():
    manager = HFModelManager()

    # Example: Download a specific model
    # You can modify this to download any HF model
    repo_id = "microsoft/DialoGPT-medium"  # Example model

    print(f"Managing models in: {manager.models_dir}")
    print(f"Available models: {manager.list_available_models(repo_id)}")

    # Uncomment to download a model:
    # manager.download_model(repo_id)


if __name__ == "__main__":
    main()
EOF
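
# Example (sketch, optional): once the manager script exists, you can ask it which GGUF
# files a repository actually ships before downloading anything. The repository id below
# is purely illustrative -- substitute any repo that publishes GGUF files.
#
#   docker-compose run --rm --no-deps model-manager sh -c \
#     "pip install --quiet huggingface_hub && python -c \"from model_manager import HFModelManager; print(HFModelManager().list_available_models('TheBloke/Mistral-7B-Instruct-v0.2-GGUF'))\""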
# Create configuration directory
mkdir -p config
cat > config/models.json << 'EOF'
{
  "available_models": {
    "microsoft/DialoGPT-medium": {
      "description": "Microsoft DialoGPT Medium",
      "size": "345M",
      "format": "gguf"
    },
    "microsoft/DialoGPT-large": {
      "description": "Microsoft DialoGPT Large",
      "size": "774M",
      "format": "gguf"
    }
  },
  "current_model": null,
  "settings": {
    "gpu_layers": 35,
    "threads": 8,
    "batch_size": 512,
    "context_size": 4096
  }
}
EOF

# Create model download script
cat > download_model.sh << 'EOF'
#!/bin/bash

# Download a specific Hugging Face model
# Usage: ./download_model.sh <repo_id> [filename]

REPO_ID=${1:-"microsoft/DialoGPT-medium"}
FILENAME=${2:-""}

echo "=== Downloading Hugging Face Model ==="
echo "Repository: $REPO_ID"
echo "Filename: ${FILENAME:-"auto-select a GGUF file"}"
echo ""

# Run the model manager inside its container to download the model.
# huggingface_hub is installed first because python:3.11-slim ships without it,
# and --no-deps avoids starting the model server, which needs a model to exist first.
docker-compose run --rm --no-deps model-manager sh -c "pip install --quiet huggingface_hub && python -c \"
from model_manager import HFModelManager
import sys

manager = HFModelManager()
try:
    if '$FILENAME':
        manager.download_model('$REPO_ID', '$FILENAME')
    else:
        manager.download_model('$REPO_ID')
    print('Model downloaded successfully!')
except Exception as e:
    print(f'Error: {e}')
    sys.exit(1)
\""

echo ""
echo "=== Model Download Complete ==="
echo "You can now start the server with: docker-compose up"
EOF

chmod +x download_model.sh

# Create API test script
cat > test_api.sh << 'EOF'
#!/bin/bash

# Test the model API
# Usage: ./test_api.sh [prompt]

PROMPT=${1:-"Hello, how are you?"}
API_URL="http://localhost:8080/completion"

echo "=== Testing Model API ==="
echo "Prompt: $PROMPT"
echo "API URL: $API_URL"
echo ""

# Test the API
curl -X POST "$API_URL" \
  -H "Content-Type: application/json" \
  -d "{
    \"prompt\": \"$PROMPT\",
    \"n_predict\": 100,
    \"temperature\": 0.7,
    \"top_p\": 0.9,
    \"stream\": false
  }" | jq '.'

echo ""
echo "=== API Test Complete ==="
EOF

chmod +x test_api.sh
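
# Example (sketch, optional): test_api.sh exercises the server's native /completion
# endpoint; the same llama.cpp server also exposes an OpenAI-style chat endpoint that
# can be checked directly once a model is loaded:
#
#   curl -s http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 64}' | jq '.'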
echo "" echo "=== API Test Complete ===" EOF chmod +x test_api.sh # Create startup script cat > start_server.sh << 'EOF' #!/bin/bash echo "=== Starting Hugging Face Model Server ===" echo "" # Check if NVIDIA GPU is available if command -v nvidia-smi &> /dev/null; then echo "NVIDIA GPU detected:" nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader,nounits echo "" echo "Starting with GPU acceleration..." docker-compose up llama-cpp-server else echo "No NVIDIA GPU detected, starting with CPU only..." # Modify docker-compose to remove GPU requirements sed 's/n-gpu-layers 35/n-gpu-layers 0/' docker-compose.yml > docker-compose-cpu.yml docker-compose -f docker-compose-cpu.yml up llama-cpp-server fi EOF chmod +x start_server.sh echo "" echo "=== Setup Complete! ===" echo "" echo "Project directory: $PROJECT_DIR" echo "" echo "=== Next Steps ===" echo "1. Download a model:" echo " ./download_model.sh microsoft/DialoGPT-medium" echo "" echo "2. Start the server:" echo " ./start_server.sh" echo "" echo "3. Test the API:" echo " ./test_api.sh 'Hello, how are you?'" echo "" echo "=== Available Commands ===" echo "- Download model: ./download_model.sh [filename]" echo "- Start server: ./start_server.sh" echo "- Test API: ./test_api.sh [prompt]" echo "- View logs: docker-compose logs -f llama-cpp-server" echo "- Stop server: docker-compose down" echo "" echo "=== Parallelism Features ===" echo "- GPU acceleration with NVIDIA support" echo "- Multi-threading for CPU processing" echo "- Batch processing for efficiency" echo "- Continuous batching for multiple requests" echo "" echo "=== OpenAI-Compatible API ===" echo "The server provides OpenAI-compatible endpoints:" echo "- POST /completion - Text completion" echo "- POST /chat/completions - Chat completions" echo "- GET /models - List available models"