#!/bin/bash
# Advanced Hugging Face Model Runner with Parallelism
# This script sets up a Docker-based solution that mimics Docker Model Runner functionality
# Specifically designed for HF models not available in LM Studio
set -e
echo "=== Advanced Hugging Face Model Runner Setup ==="
echo "Designed for models not available in LM Studio with parallelism support"
echo ""
# Create project directory
PROJECT_DIR="$HOME/hf-model-runner"
mkdir -p "$PROJECT_DIR"
cd "$PROJECT_DIR"
echo "Project directory: $PROJECT_DIR"
# Create Docker Compose configuration with GPU support and parallelism
cat > docker-compose.yml << 'EOF'
version: '3.8'

services:
  # Main model server with GPU support and parallelism
  llama-cpp-server:
    image: ghcr.io/ggerganov/llama.cpp:server  # use the :server-cuda tag for a CUDA-enabled build
    container_name: hf-model-server
    ports:
      - "8080:8080"
    volumes:
      - ./models:/models
      - ./config:/config
    environment:
      - MODEL_PATH=/models
      - GPU_LAYERS=35   # Adjust based on your GPU memory
      - THREADS=8       # CPU threads for parallelism
      - BATCH_SIZE=512  # Batch size for parallel processing
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    # --parallel N sets the number of server slots for concurrent requests
    command: >
      --model /models/current_model.gguf
      --host 0.0.0.0
      --port 8080
      --n-gpu-layers 35
      --threads 8
      --batch-size 512
      --parallel 4
      --cont-batching
      --ctx-size 4096
      --keep-alive 300
      --log-format json
    restart: unless-stopped

  # Alternative: vLLM server for even better parallelism
  vllm-server:
    image: vllm/vllm-openai:latest
    container_name: hf-vllm-server
    ports:
      - "8000:8000"
    volumes:
      - ./models:/models
    environment:
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: >
      --model /models/current_model
      --host 0.0.0.0
      --port 8000
      --tensor-parallel-size 1
      --gpu-memory-utilization 0.9
      --max-model-len 4096
      --trust-remote-code
    restart: unless-stopped
    profiles:
      - vllm

  # Model management service
  model-manager:
    image: python:3.11-slim
    container_name: hf-model-manager
    volumes:
      - ./models:/models
      - ./scripts:/scripts
      - ./config:/config
    working_dir: /scripts
    # python:3.11-slim ships without huggingface_hub, so install it before running
    command: sh -c "pip install --quiet huggingface_hub && python model_manager.py"
    restart: unless-stopped
    depends_on:
      - llama-cpp-server
EOF
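# Usage note (sketch): the vLLM service is gated behind a compose profile, so it
# only starts when explicitly requested; the llama.cpp server starts by default.
#   docker-compose up -d llama-cpp-server
#   docker-compose --profile vllm up -d vllm-server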
# Create model management script
mkdir -p scripts
cat > scripts/model_manager.py << 'EOF'
#!/usr/bin/env python3
"""
Hugging Face Model Manager
Downloads and manages HF models with GGUF format support
"""
import logging
from pathlib import Path

from huggingface_hub import hf_hub_download, list_repo_files

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class HFModelManager:
    def __init__(self, models_dir="/models"):
        self.models_dir = Path(models_dir)
        self.models_dir.mkdir(exist_ok=True)
        self.config_file = Path("/config/models.json")

    def list_available_models(self, repo_id):
        """List available GGUF models in a HF repository"""
        try:
            files = list_repo_files(repo_id)
            gguf_files = [f for f in files if f.endswith('.gguf')]
            return gguf_files
        except Exception as e:
            logger.error(f"Error listing models for {repo_id}: {e}")
            return []

    def download_model(self, repo_id, filename=None):
        """Download a GGUF model from Hugging Face"""
        try:
            if filename is None:
                # No filename given: pick a GGUF file automatically.
                # Reverse-alphabetical order is only a rough heuristic;
                # pass an explicit filename to control the quantization.
                files = self.list_available_models(repo_id)
                if not files:
                    raise ValueError(f"No GGUF files found in {repo_id}")
                gguf_files = sorted(files, key=lambda x: x.lower(), reverse=True)
                filename = gguf_files[0]
                logger.info(f"Auto-selected model: {filename}")

            logger.info(f"Downloading {repo_id}/{filename}...")
            # Download the model into the shared models directory
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=self.models_dir,
                local_dir_use_symlinks=False
            )

            # Point the current_model.gguf symlink at the downloaded file
            current_model_path = self.models_dir / "current_model.gguf"
            if current_model_path.exists():
                current_model_path.unlink()
            current_model_path.symlink_to(Path(model_path).name)

            logger.info(f"Model downloaded to: {model_path}")
            logger.info(f"Current model symlink: {current_model_path}")
            return model_path
        except Exception as e:
            logger.error(f"Error downloading model: {e}")
            raise

    def get_model_info(self, repo_id):
        """Get information about a model repository"""
        try:
            # This would typically use the HF API
            return {
                "repo_id": repo_id,
                "available_files": self.list_available_models(repo_id),
                "status": "available"
            }
        except Exception as e:
            logger.error(f"Error getting model info: {e}")
            return None


def main():
    manager = HFModelManager()
    # Example: Download a specific model
    # You can modify this to download any HF model
    repo_id = "microsoft/DialoGPT-medium"  # Example model
    print(f"Managing models in: {manager.models_dir}")
    print(f"Available models: {manager.list_available_models(repo_id)}")
    # Uncomment to download a model:
    # manager.download_model(repo_id)


if __name__ == "__main__":
    main()
EOF
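# Ad-hoc usage sketch: list a repository's GGUF files without starting the full
# stack (the repo id below is only an example of a GGUF-hosting repository).
#   docker-compose run --rm --no-deps model-manager sh -c \
#     "pip install --quiet huggingface_hub && python -c \"from model_manager import HFModelManager; print(HFModelManager().list_available_models('TheBloke/Mistral-7B-Instruct-v0.2-GGUF'))\""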
# Create configuration directory
mkdir -p config
cat > config/models.json << 'EOF'
{
  "available_models": {
    "microsoft/DialoGPT-medium": {
      "description": "Microsoft DialoGPT Medium",
      "size": "345M",
      "format": "gguf"
    },
    "microsoft/DialoGPT-large": {
      "description": "Microsoft DialoGPT Large",
      "size": "774M",
      "format": "gguf"
    }
  },
  "current_model": null,
  "settings": {
    "gpu_layers": 35,
    "threads": 8,
    "batch_size": 512,
    "context_size": 4096
  }
}
EOF
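# Note: the entries above are illustrative placeholders. llama.cpp can only load
# GGUF files, so point download_model.sh at a repository that actually publishes
# GGUF quantizations (such repos are typically suffixed "-GGUF" on Hugging Face).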
# Create model download script
cat > download_model.sh << 'EOF'
#!/bin/bash
# Download specific Hugging Face model
# Usage: ./download_model.sh <repo_id> [filename]
REPO_ID=${1:-"microsoft/DialoGPT-medium"}
FILENAME=${2:-""}

echo "=== Downloading Hugging Face Model ==="
echo "Repository: $REPO_ID"
echo "Filename: ${FILENAME:-"auto-select largest GGUF"}"
echo ""

# Run the model manager inside the container to download the model.
# huggingface_hub is installed into the container first, since the
# python:3.11-slim image does not ship with it; --no-deps avoids starting
# the server before a model exists.
docker-compose run --rm --no-deps model-manager sh -c "pip install --quiet huggingface_hub && python -c \"
from model_manager import HFModelManager
import sys

manager = HFModelManager()
try:
    if '$FILENAME':
        manager.download_model('$REPO_ID', '$FILENAME')
    else:
        manager.download_model('$REPO_ID')
    print('Model downloaded successfully!')
except Exception as e:
    print(f'Error: {e}')
    sys.exit(1)
\""

echo ""
echo "=== Model Download Complete ==="
echo "You can now start the server with: docker-compose up"
EOF
chmod +x download_model.sh
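# Example invocation (the repo and filename below are assumed examples, not part
# of the original config): pass an explicit quantization to skip the auto-select
# heuristic.
#   ./download_model.sh TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf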
# Create API test script
cat > test_api.sh << 'EOF'
#!/bin/bash
# Test the model API (llama.cpp native /completion endpoint)
# Usage: ./test_api.sh [prompt]
# Requires: curl, jq
PROMPT=${1:-"Hello, how are you?"}
API_URL="http://localhost:8080/completion"

echo "=== Testing Model API ==="
echo "Prompt: $PROMPT"
echo "API URL: $API_URL"
echo ""

# Send a completion request
curl -X POST "$API_URL" \
  -H "Content-Type: application/json" \
  -d "{
    \"prompt\": \"$PROMPT\",
    \"n_predict\": 100,
    \"temperature\": 0.7,
    \"top_p\": 0.9,
    \"stream\": false
  }" | jq '.'

echo ""
echo "=== API Test Complete ==="
EOF
chmod +x test_api.sh
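# Optional companion test (sketch): exercises the OpenAI-compatible chat route.
# Assumes the llama.cpp server exposes /v1/chat/completions on port 8080; the
# "model" field is a placeholder the server does not use for routing.
cat > test_chat_api.sh << 'EOF'
#!/bin/bash
# Test the OpenAI-compatible chat endpoint
# Usage: ./test_chat_api.sh [message]
# Requires: curl, jq
MESSAGE=${1:-"Hello, how are you?"}
API_URL="http://localhost:8080/v1/chat/completions"

echo "=== Testing Chat Completions API ==="
echo "Message: $MESSAGE"
echo "API URL: $API_URL"
echo ""

curl -s -X POST "$API_URL" \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"current_model\",
    \"messages\": [{\"role\": \"user\", \"content\": \"$MESSAGE\"}],
    \"max_tokens\": 100,
    \"temperature\": 0.7
  }" | jq '.'

echo ""
echo "=== Chat API Test Complete ==="
EOF
chmod +x test_chat_api.sh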
# Create startup script
cat > start_server.sh << 'EOF'
#!/bin/bash
echo "=== Starting Hugging Face Model Server ==="
echo ""

# Check if an NVIDIA GPU is available
if command -v nvidia-smi &> /dev/null; then
    echo "NVIDIA GPU detected:"
    nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader,nounits
    echo ""
    echo "Starting with GPU acceleration..."
    docker-compose up llama-cpp-server
else
    echo "No NVIDIA GPU detected, starting with CPU only..."
    # Generate a CPU-only compose file: disable GPU offload and drop the
    # NVIDIA device reservation so Docker does not require the nvidia runtime
    sed -e 's/--n-gpu-layers 35/--n-gpu-layers 0/' \
        -e '/deploy:/,/capabilities: \[gpu\]/d' \
        docker-compose.yml > docker-compose-cpu.yml
    docker-compose -f docker-compose-cpu.yml up llama-cpp-server
fi
EOF
chmod +x start_server.sh
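# Readiness check sketch: the llama.cpp server exposes a /health endpoint that
# reports an ok status once the model has finished loading.
#   curl -s http://localhost:8080/health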
echo ""
echo "=== Setup Complete! ==="
echo ""
echo "Project directory: $PROJECT_DIR"
echo ""
echo "=== Next Steps ==="
echo "1. Download a model:"
echo " ./download_model.sh microsoft/DialoGPT-medium"
echo ""
echo "2. Start the server:"
echo " ./start_server.sh"
echo ""
echo "3. Test the API:"
echo " ./test_api.sh 'Hello, how are you?'"
echo ""
echo "=== Available Commands ==="
echo "- Download model: ./download_model.sh <repo_id> [filename]"
echo "- Start server: ./start_server.sh"
echo "- Test API: ./test_api.sh [prompt]"
echo "- View logs: docker-compose logs -f llama-cpp-server"
echo "- Stop server: docker-compose down"
echo ""
echo "=== Parallelism Features ==="
echo "- GPU acceleration with NVIDIA support"
echo "- Multi-threading for CPU processing"
echo "- Batch processing for efficiency"
echo "- Continuous batching for multiple requests"
echo ""
echo "=== OpenAI-Compatible API ==="
echo "The server provides OpenAI-compatible endpoints:"
echo "- POST /completion - Text completion"
echo "- POST /chat/completions - Chat completions"
echo "- GET /models - List available models"