#!/bin/bash

# Advanced Hugging Face Model Runner with Parallelism
# This script sets up a Docker-based solution that mimics Docker Model Runner functionality
# Specifically designed for HF models not available in LM Studio

set -e

echo "=== Advanced Hugging Face Model Runner Setup ==="
echo "Designed for models not available in LM Studio with parallelism support"
echo ""

# Create project directory
PROJECT_DIR="$HOME/hf-model-runner"
mkdir -p "$PROJECT_DIR"
cd "$PROJECT_DIR"

echo "Project directory: $PROJECT_DIR"

# Create Docker Compose configuration with GPU support and parallelism
cat > docker-compose.yml << 'EOF'
version: '3.8'

services:
  # Main model server with GPU support and parallelism
  llama-cpp-server:
    image: ghcr.io/ggerganov/llama.cpp:server
    container_name: hf-model-server
    ports:
      - "8080:8080"
    volumes:
      - ./models:/models
      - ./config:/config
    environment:
      - MODEL_PATH=/models
      - GPU_LAYERS=35   # Adjust based on your GPU memory
      - THREADS=8       # CPU threads for parallelism
      - BATCH_SIZE=512  # Batch size for parallel processing
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: >
      --model /models/current_model.gguf
      --host 0.0.0.0
      --port 8080
      --n-gpu-layers 35
      --threads 8
      --batch-size 512
      --parallel 4
      --cont-batching
      --ctx-size 4096
      --keep-alive 300
      --log-format json
    restart: unless-stopped

  # Alternative: vLLM server for even better parallelism
  vllm-server:
    image: vllm/vllm-openai:latest
    container_name: hf-vllm-server
    ports:
      - "8000:8000"
    volumes:
      - ./models:/models
    environment:
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: >
      --model /models/current_model
      --host 0.0.0.0
      --port 8000
      --tensor-parallel-size 1
      --gpu-memory-utilization 0.9
      --max-model-len 4096
      --trust-remote-code
    restart: unless-stopped
    profiles:
      - vllm

  # Model management service
  model-manager:
    image: python:3.11-slim
    container_name: hf-model-manager
    volumes:
      - ./models:/models
      - ./scripts:/scripts
      - ./config:/config
    working_dir: /scripts
    # python:3.11-slim ships without huggingface_hub, so install dependencies before running
    command: sh -c "pip install --quiet huggingface_hub requests && python model_manager.py"
    restart: unless-stopped
    depends_on:
      - llama-cpp-server
EOF
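
# Note: the vLLM service above is gated behind a Compose profile, so it does not
# start by default. A minimal launch sketch (assumes a Compose version with
# profile support; use "docker-compose" or "docker compose" to match your setup):
#   docker compose --profile vllm up -d vllm-server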

# Create model management script
mkdir -p scripts
cat > scripts/model_manager.py << 'EOF'
#!/usr/bin/env python3
"""
Hugging Face Model Manager
Downloads and manages HF models with GGUF format support
"""

import os
import json
import requests
import subprocess
from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class HFModelManager:
    def __init__(self, models_dir="/models"):
        self.models_dir = Path(models_dir)
        self.models_dir.mkdir(exist_ok=True)
        self.config_file = Path("/config/models.json")

    def list_available_models(self, repo_id):
        """List available GGUF models in a HF repository"""
        try:
            files = list_repo_files(repo_id)
            gguf_files = [f for f in files if f.endswith('.gguf')]
            return gguf_files
        except Exception as e:
            logger.error(f"Error listing models for {repo_id}: {e}")
            return []

    def download_model(self, repo_id, filename=None):
        """Download a GGUF model from Hugging Face"""
        try:
            if filename is None:
                # Auto-select a GGUF file from the repository
                files = self.list_available_models(repo_id)
                if not files:
                    raise ValueError(f"No GGUF files found in {repo_id}")

                # Sort filenames in reverse lexicographic order as a rough
                # heuristic (filenames do not reliably encode size)
                gguf_files = sorted(files, key=lambda x: x.lower(), reverse=True)
                filename = gguf_files[0]
                logger.info(f"Auto-selected model: {filename}")

            logger.info(f"Downloading {repo_id}/{filename}...")

            # Download the model
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=self.models_dir,
                local_dir_use_symlinks=False
            )

            # Create symlink for current model
            current_model_path = self.models_dir / "current_model.gguf"
            if current_model_path.exists():
                current_model_path.unlink()
            current_model_path.symlink_to(Path(model_path).name)

            logger.info(f"Model downloaded to: {model_path}")
            logger.info(f"Current model symlink: {current_model_path}")

            return model_path

        except Exception as e:
            logger.error(f"Error downloading model: {e}")
            raise

    def get_model_info(self, repo_id):
        """Get information about a model repository"""
        try:
            # This would typically use the HF API
            return {
                "repo_id": repo_id,
                "available_files": self.list_available_models(repo_id),
                "status": "available"
            }
        except Exception as e:
            logger.error(f"Error getting model info: {e}")
            return None


def main():
    manager = HFModelManager()

    # Example: Download a specific model
    # You can modify this to download any HF model
    repo_id = "microsoft/DialoGPT-medium"  # Example model

    print(f"Managing models in: {manager.models_dir}")
    print(f"Available models: {manager.list_available_models(repo_id)}")

    # Uncomment to download a model:
    # manager.download_model(repo_id)


if __name__ == "__main__":
    main()
EOF
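
# Quick smoke test for the manager without downloading anything; a sketch that
# assumes the Python dependencies are installed in the container, as arranged
# in docker-compose.yml above:
#   docker-compose run --rm model-manager python -c \
#     "from model_manager import HFModelManager; print(HFModelManager().get_model_info('microsoft/DialoGPT-medium'))"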

# Create configuration directory
mkdir -p config
cat > config/models.json << 'EOF'
{
  "available_models": {
    "microsoft/DialoGPT-medium": {
      "description": "Microsoft DialoGPT Medium",
      "size": "345M",
      "format": "gguf"
    },
    "microsoft/DialoGPT-large": {
      "description": "Microsoft DialoGPT Large",
      "size": "774M",
      "format": "gguf"
    }
  },
  "current_model": null,
  "settings": {
    "gpu_layers": 35,
    "threads": 8,
    "batch_size": 512,
    "context_size": 4096
  }
}
EOF
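
# The "settings" block above mirrors the flags passed to llama-cpp-server in
# docker-compose.yml. For example, a value can be read back out with jq (which
# test_api.sh below already assumes is installed):
#   jq '.settings.gpu_layers' config/models.json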

# Create model download script
cat > download_model.sh << 'EOF'
#!/bin/bash

# Download specific Hugging Face model
# Usage: ./download_model.sh <repo_id> [filename]

REPO_ID=${1:-"microsoft/DialoGPT-medium"}
FILENAME=${2:-""}

echo "=== Downloading Hugging Face Model ==="
echo "Repository: $REPO_ID"
echo "Filename: ${FILENAME:-"auto-select largest GGUF"}"
echo ""

# Install the required Python packages inside the manager container, then run
# the download there (python:3.11-slim does not ship with huggingface_hub)
docker-compose run --rm model-manager sh -c "pip install --quiet huggingface_hub requests && python -c \"
from model_manager import HFModelManager
import sys

manager = HFModelManager()
try:
    if '$FILENAME':
        manager.download_model('$REPO_ID', '$FILENAME')
    else:
        manager.download_model('$REPO_ID')
    print('Model downloaded successfully!')
except Exception as e:
    print(f'Error: {e}')
    sys.exit(1)
\""

echo ""
echo "=== Model Download Complete ==="
echo "You can now start the server with: docker-compose up"
EOF
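
# Example invocations (the explicit filename below is purely illustrative; list
# a repo's real GGUF filenames first via model_manager.list_available_models):
#   ./download_model.sh microsoft/DialoGPT-medium
#   ./download_model.sh <repo_id> <model-file>.gguf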

chmod +x download_model.sh

# Create API test script
cat > test_api.sh << 'EOF'
#!/bin/bash

# Test the model API
# Usage: ./test_api.sh [prompt]

PROMPT=${1:-"Hello, how are you?"}
API_URL="http://localhost:8080/completion"

echo "=== Testing Model API ==="
echo "Prompt: $PROMPT"
echo "API URL: $API_URL"
echo ""

# Test the API
curl -X POST "$API_URL" \
    -H "Content-Type: application/json" \
    -d "{
        \"prompt\": \"$PROMPT\",
        \"n_predict\": 100,
        \"temperature\": 0.7,
        \"top_p\": 0.9,
        \"stream\": false
    }" | jq '.'

echo ""
echo "=== API Test Complete ==="
EOF
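
# The same server also exposes an OpenAI-style chat endpoint on recent
# llama.cpp builds; a sketch (availability depends on the image version you pull):
#   curl -X POST http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "current_model", "messages": [{"role": "user", "content": "Hello"}]}' | jq '.'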

chmod +x test_api.sh

# Create startup script
cat > start_server.sh << 'EOF'
#!/bin/bash

echo "=== Starting Hugging Face Model Server ==="
echo ""

# Check if NVIDIA GPU is available
if command -v nvidia-smi &> /dev/null; then
    echo "NVIDIA GPU detected:"
    nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader,nounits
    echo ""
    echo "Starting with GPU acceleration..."
    docker-compose up llama-cpp-server
else
    echo "No NVIDIA GPU detected, starting with CPU only..."
    # Generate a CPU-only compose file: drop the NVIDIA device reservation and
    # set GPU layers to 0
    sed -e '/deploy:/,/capabilities: \[gpu\]/d' \
        -e 's/n-gpu-layers 35/n-gpu-layers 0/' \
        docker-compose.yml > docker-compose-cpu.yml
    docker-compose -f docker-compose-cpu.yml up llama-cpp-server
fi
EOF
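
# To run the server in the background instead of the foreground, either branch
# above can use Compose's detach flag, e.g.:
#   docker-compose up -d llama-cpp-server
#   docker-compose logs -f llama-cpp-server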

chmod +x start_server.sh

echo ""
echo "=== Setup Complete! ==="
echo ""
echo "Project directory: $PROJECT_DIR"
echo ""
echo "=== Next Steps ==="
echo "1. Download a model:"
echo "   ./download_model.sh microsoft/DialoGPT-medium"
echo ""
echo "2. Start the server:"
echo "   ./start_server.sh"
echo ""
echo "3. Test the API:"
echo "   ./test_api.sh 'Hello, how are you?'"
echo ""
echo "=== Available Commands ==="
echo "- Download model: ./download_model.sh <repo_id> [filename]"
echo "- Start server: ./start_server.sh"
echo "- Test API: ./test_api.sh [prompt]"
echo "- View logs: docker-compose logs -f llama-cpp-server"
echo "- Stop server: docker-compose down"
echo ""
echo "=== Parallelism Features ==="
echo "- GPU acceleration with NVIDIA support"
echo "- Multi-threading for CPU processing"
echo "- Batch processing for efficiency"
echo "- Continuous batching for multiple requests"
echo ""
echo "=== OpenAI-Compatible API ==="
echo "The llama.cpp server provides these endpoints:"
echo "- POST /completion - native text completion"
echo "- POST /v1/chat/completions - OpenAI-compatible chat completions"
echo "- GET /v1/models - list available models"