using LLM for sentiment analysis
setup_advanced_hf_runner.sh (new file, 366 lines added)
@@ -0,0 +1,366 @@
#!/bin/bash

# Advanced Hugging Face Model Runner with Parallelism
# This script sets up a Docker-based solution that mimics Docker Model Runner functionality
# Specifically designed for HF models not available in LM Studio

set -e

echo "=== Advanced Hugging Face Model Runner Setup ==="
echo "Designed for models not available in LM Studio with parallelism support"
echo ""

# Create project directory
PROJECT_DIR="$HOME/hf-model-runner"
mkdir -p "$PROJECT_DIR"
cd "$PROJECT_DIR"

echo "Project directory: $PROJECT_DIR"

# Create Docker Compose configuration with GPU support and parallelism
cat > docker-compose.yml << 'EOF'
version: '3.8'

services:
  # Main model server with GPU support and parallelism
  llama-cpp-server:
    image: ghcr.io/ggerganov/llama.cpp:server
    container_name: hf-model-server
    ports:
      - "8080:8080"
    volumes:
      - ./models:/models
      - ./config:/config
    environment:
      - MODEL_PATH=/models
      - GPU_LAYERS=35    # Adjust based on your GPU memory
      - THREADS=8        # CPU threads for parallelism
      - BATCH_SIZE=512   # Batch size for parallel processing
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: >
      --model /models/current_model.gguf
      --host 0.0.0.0
      --port 8080
      --n-gpu-layers 35
      --threads 8
      --batch-size 512
      --parallel 4
      --cont-batching
      --ctx-size 4096
      --keep-alive 300
      --log-format json
    restart: unless-stopped

  # Alternative: vLLM server for even better parallelism
  vllm-server:
    image: vllm/vllm-openai:latest
    container_name: hf-vllm-server
    ports:
      - "8000:8000"
    volumes:
      - ./models:/models
    environment:
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    command: >
      --model /models/current_model
      --host 0.0.0.0
      --port 8000
      --tensor-parallel-size 1
      --gpu-memory-utilization 0.9
      --max-model-len 4096
      --trust-remote-code
    restart: unless-stopped
    profiles:
      - vllm

  # Model management service
  model-manager:
    image: python:3.11-slim
    container_name: hf-model-manager
    volumes:
      - ./models:/models
      - ./scripts:/scripts
      - ./config:/config
    working_dir: /scripts
    # python:3.11-slim does not ship huggingface_hub/requests, so install them first
    command: sh -c "pip install --quiet huggingface_hub requests && python model_manager.py"
    restart: unless-stopped
    depends_on:
      - llama-cpp-server

EOF
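
# Note on the services above: the vLLM alternative sits behind a compose profile,
# so a plain "docker-compose up" starts only the llama.cpp path. Rough sketches,
# assuming a model has already been placed at models/current_model.gguf and that
# the image exposes llama.cpp's /health route (recent server builds do):
#   docker-compose --profile vllm up -d vllm-server   # opt in to the vLLM server
#   curl http://localhost:8080/health                 # llama.cpp liveness check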

# Create model management script
mkdir -p scripts
cat > scripts/model_manager.py << 'EOF'
#!/usr/bin/env python3
"""
Hugging Face Model Manager
Downloads and manages HF models with GGUF format support
"""

import os
import json
import requests
import subprocess
from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class HFModelManager:
    def __init__(self, models_dir="/models"):
        self.models_dir = Path(models_dir)
        self.models_dir.mkdir(exist_ok=True)
        self.config_file = Path("/config/models.json")

    def list_available_models(self, repo_id):
        """List available GGUF models in a HF repository"""
        try:
            files = list_repo_files(repo_id)
            gguf_files = [f for f in files if f.endswith('.gguf')]
            return gguf_files
        except Exception as e:
            logger.error(f"Error listing models for {repo_id}: {e}")
            return []

    def download_model(self, repo_id, filename=None):
        """Download a GGUF model from Hugging Face"""
        try:
            if filename is None:
                # No filename given: auto-select a GGUF file from the repo
                files = self.list_available_models(repo_id)
                if not files:
                    raise ValueError(f"No GGUF files found in {repo_id}")

                # Rough heuristic: sort filenames in reverse alphabetical order
                # and take the first one (this does not inspect actual file sizes)
                gguf_files = sorted(files, key=lambda x: x.lower(), reverse=True)
                filename = gguf_files[0]
                logger.info(f"Auto-selected model: {filename}")

            logger.info(f"Downloading {repo_id}/{filename}...")

            # Download the model
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=self.models_dir,
                local_dir_use_symlinks=False
            )

            # Create symlink for current model
            current_model_path = self.models_dir / "current_model.gguf"
            if current_model_path.exists():
                current_model_path.unlink()
            current_model_path.symlink_to(Path(model_path).name)

            logger.info(f"Model downloaded to: {model_path}")
            logger.info(f"Current model symlink: {current_model_path}")

            return model_path

        except Exception as e:
            logger.error(f"Error downloading model: {e}")
            raise

    def get_model_info(self, repo_id):
        """Get information about a model repository"""
        try:
            # This would typically use the HF API
            return {
                "repo_id": repo_id,
                "available_files": self.list_available_models(repo_id),
                "status": "available"
            }
        except Exception as e:
            logger.error(f"Error getting model info: {e}")
            return None


def main():
    manager = HFModelManager()

    # Example: Download a specific model
    # You can modify this to download any HF model
    repo_id = "microsoft/DialoGPT-medium"  # Example model

    print(f"Managing models in: {manager.models_dir}")
    print(f"Available models: {manager.list_available_models(repo_id)}")

    # Uncomment to download a model:
    # manager.download_model(repo_id)


if __name__ == "__main__":
    main()
EOF
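
# One-off sketch for poking at a repository with the manager class above. The
# repo name is only an illustration, and the ad-hoc pip install mirrors what
# download_model.sh does below, since python:3.11-slim ships neither
# huggingface_hub nor requests:
#   docker-compose run --rm model-manager sh -c "pip install --quiet huggingface_hub requests && python -c \"from model_manager import HFModelManager; print(HFModelManager().list_available_models('TheBloke/Mistral-7B-Instruct-v0.2-GGUF'))\""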

# Create configuration directory
mkdir -p config
cat > config/models.json << 'EOF'
{
  "available_models": {
    "microsoft/DialoGPT-medium": {
      "description": "Microsoft DialoGPT Medium",
      "size": "345M",
      "format": "gguf"
    },
    "microsoft/DialoGPT-large": {
      "description": "Microsoft DialoGPT Large",
      "size": "774M",
      "format": "gguf"
    }
  },
  "current_model": null,
  "settings": {
    "gpu_layers": 35,
    "threads": 8,
    "batch_size": 512,
    "context_size": 4096
  }
}
EOF
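
# The settings above are informational defaults kept alongside the models; they
# are not injected into docker-compose.yml automatically. A small sketch for
# inspecting them (jq is already assumed by test_api.sh below):
#   jq '.settings' config/models.json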

# Create model download script
cat > download_model.sh << 'EOF'
#!/bin/bash

# Download specific Hugging Face model
# Usage: ./download_model.sh <repo_id> [filename]

REPO_ID=${1:-"microsoft/DialoGPT-medium"}
FILENAME=${2:-""}

echo "=== Downloading Hugging Face Model ==="
echo "Repository: $REPO_ID"
echo "Filename: ${FILENAME:-"auto-select a GGUF file"}"
echo ""

# Install the required Python packages inside the manager container and run the
# download there (python:3.11-slim does not ship huggingface_hub or requests)
docker-compose run --rm model-manager sh -c "pip install --quiet huggingface_hub requests && python -c \"
from model_manager import HFModelManager
import sys

manager = HFModelManager()
try:
    if '$FILENAME':
        manager.download_model('$REPO_ID', '$FILENAME')
    else:
        manager.download_model('$REPO_ID')
    print('Model downloaded successfully!')
except Exception as e:
    print(f'Error: {e}')
    sys.exit(1)
\""

echo ""
echo "=== Model Download Complete ==="
echo "You can now start the server with: docker-compose up"
EOF

chmod +x download_model.sh
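
# Usage sketches (the repository and filename below are only illustrations; any
# Hugging Face repo that publishes .gguf files will work):
#   ./download_model.sh TheBloke/Mistral-7B-Instruct-v0.2-GGUF
#   ./download_model.sh TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf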

# Create API test script
cat > test_api.sh << 'EOF'
#!/bin/bash

# Test the model API
# Usage: ./test_api.sh [prompt]

PROMPT=${1:-"Hello, how are you?"}
API_URL="http://localhost:8080/completion"

echo "=== Testing Model API ==="
echo "Prompt: $PROMPT"
echo "API URL: $API_URL"
echo ""

# Test the API
curl -X POST "$API_URL" \
  -H "Content-Type: application/json" \
  -d "{
    \"prompt\": \"$PROMPT\",
    \"n_predict\": 100,
    \"temperature\": 0.7,
    \"top_p\": 0.9,
    \"stream\": false
  }" | jq '.'

echo ""
echo "=== API Test Complete ==="
EOF

chmod +x test_api.sh
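
# test_api.sh exercises llama.cpp's native /completion route. A rough equivalent
# against the OpenAI-compatible surface (assuming the image exposes the /v1
# routes, which current llama.cpp server builds do):
#   curl -X POST http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello, how are you?"}], "max_tokens": 100}'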

# Create startup script
cat > start_server.sh << 'EOF'
#!/bin/bash

echo "=== Starting Hugging Face Model Server ==="
echo ""

# Check if NVIDIA GPU is available
if command -v nvidia-smi &> /dev/null; then
    echo "NVIDIA GPU detected:"
    nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader,nounits
    echo ""
    echo "Starting with GPU acceleration..."
    docker-compose up llama-cpp-server
else
    echo "No NVIDIA GPU detected, starting with CPU only..."
    # Generate a CPU-only compose file by disabling GPU offload.
    # Note: this only sets --n-gpu-layers to 0; on hosts without the NVIDIA
    # container runtime the deploy/resources GPU reservation must also be removed.
    sed 's/n-gpu-layers 35/n-gpu-layers 0/' docker-compose.yml > docker-compose-cpu.yml
    docker-compose -f docker-compose-cpu.yml up llama-cpp-server
fi
EOF

chmod +x start_server.sh
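
# start_server.sh keeps the server in the foreground; a detached start plus log
# tailing is an equivalent alternative (sketch):
#   docker-compose up -d llama-cpp-server
#   docker-compose logs -f llama-cpp-server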

echo ""
echo "=== Setup Complete! ==="
echo ""
echo "Project directory: $PROJECT_DIR"
echo ""
echo "=== Next Steps ==="
echo "1. Download a model:"
echo "   ./download_model.sh microsoft/DialoGPT-medium"
echo ""
echo "2. Start the server:"
echo "   ./start_server.sh"
echo ""
echo "3. Test the API:"
echo "   ./test_api.sh 'Hello, how are you?'"
echo ""
echo "=== Available Commands ==="
echo "- Download model: ./download_model.sh <repo_id> [filename]"
echo "- Start server: ./start_server.sh"
echo "- Test API: ./test_api.sh [prompt]"
echo "- View logs: docker-compose logs -f llama-cpp-server"
echo "- Stop server: docker-compose down"
echo ""
echo "=== Parallelism Features ==="
echo "- GPU acceleration with NVIDIA support"
echo "- Multi-threading for CPU processing"
echo "- Batch processing for efficiency"
echo "- Continuous batching for multiple requests"
echo ""
echo "=== OpenAI-Compatible API ==="
echo "The server exposes the following endpoints:"
echo "- POST /completion - Native llama.cpp text completion"
echo "- POST /v1/chat/completions - OpenAI-compatible chat completions"
echo "- GET /v1/models - List available models"
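
# Many OpenAI-compatible clients only need their base URL pointed at this server.
# A sketch using the environment variables the official OpenAI SDKs read (the key
# value is arbitrary, since the local server does not check it):
#   export OPENAI_BASE_URL="http://localhost:8080/v1"
#   export OPENAI_API_KEY="sk-local"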