Files
mines/rin/miner/gpu/RinHash-hip/rinhash.hip.cu
Dobromir Popov b475590b61 gpu optimizations
2025-09-06 14:20:19 +03:00

286 lines
10 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <vector>
#include <stdexcept>
// Include shared device functions
#include "rinhash_device.cuh"
#include "argon2d_device.cuh"
#include "sha3-256.hip.cu"
#include "blake3_device.cuh"
// TRUE parallel RinHash kernel - processes multiple nonce values simultaneously
extern "C" __global__ void rinhash_hip_kernel_batch(
const uint8_t* input_batch, // Pre-prepared batch with different nonces
size_t input_len,
uint8_t* output_batch,
block* argon2_memory,
uint32_t start_nonce,
uint32_t batch_size
) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Each thread processes one nonce from the prepared batch
if (tid < batch_size) {
// Get this thread's input (80 bytes per input)
const uint8_t* input = &input_batch[tid * 80];
// Allocate per-thread memory offsets
block* thread_memory = &argon2_memory[tid * 64]; // 64 blocks per thread
uint8_t* thread_output = &output_batch[tid * 32]; // 32 bytes per output
// Step 1: BLAKE3 hash
uint8_t blake3_out[32];
light_hash_device(input, input_len, blake3_out);
// Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1)
uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
uint8_t argon2_out[32];
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, thread_memory, salt, 11);
// Step 3: SHA3-256 hash
sha3_256_device(argon2_out, 32, thread_output);
}
}
// Legacy single-hash kernel for compatibility
extern "C" __global__ void rinhash_hip_kernel(
const uint8_t* input,
size_t input_len,
uint8_t* output,
block* argon2_memory
) {
// Only thread 0 performs the sequential RinHash operations
if (threadIdx.x == 0) {
uint8_t blake3_out[32];
uint8_t argon2_out[32];
// Step 1: BLAKE3 hash
light_hash_device(input, input_len, blake3_out);
// Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1)
uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11);
// Step 3: SHA3-256 hash
sha3_256_device(argon2_out, 32, output);
}
__syncthreads();
}
// GPU memory cache for performance optimization
static uint8_t *d_input_cache = nullptr;
static uint8_t *d_output_cache = nullptr;
static block *d_memory_cache = nullptr;
static bool gpu_memory_initialized = false;
static size_t cached_input_size = 0;
// Initialize GPU memory once (reused across all hash operations)
static bool init_gpu_memory(size_t input_len) {
if (gpu_memory_initialized && cached_input_size >= input_len) {
return true; // Memory already allocated and sufficient
}
// Clean up old memory if size changed
if (gpu_memory_initialized) {
hipFree(d_input_cache);
hipFree(d_output_cache);
hipFree(d_memory_cache);
}
const uint32_t m_cost = 64; // Argon2 blocks (64 KiB)
hipError_t err;
// Allocate input buffer
err = hipMalloc(&d_input_cache, 80); // Standard block header size
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate input memory cache: %s\n", hipGetErrorString(err));
return false;
}
// Allocate output buffer
err = hipMalloc(&d_output_cache, 32);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate output memory cache: %s\n", hipGetErrorString(err));
hipFree(d_input_cache);
return false;
}
// Allocate minimal Argon2 memory for single-threaded operation
err = hipMalloc(&d_memory_cache, m_cost * sizeof(block));
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate argon2 memory cache: %s\n", hipGetErrorString(err));
hipFree(d_input_cache);
hipFree(d_output_cache);
return false;
}
gpu_memory_initialized = true;
cached_input_size = 80;
return true;
}
// RinHash HIP implementation with memory reuse for optimal performance
extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* output) {
// Initialize GPU memory cache on first call
if (!init_gpu_memory(input_len)) {
fprintf(stderr, "Failed to initialize GPU memory cache\n");
return;
}
hipError_t err;
// Copy input header using cached memory
err = hipMemcpy(d_input_cache, input, input_len, hipMemcpyHostToDevice);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", hipGetErrorString(err));
return;
}
// Launch minimal kernel - single block with 32 threads for optimal latency
// This reduces kernel launch overhead while maintaining GPU acceleration
dim3 blocks(1);
dim3 threads_per_block(32);
rinhash_hip_kernel<<<blocks, threads_per_block>>>(d_input_cache, input_len, d_output_cache, d_memory_cache);
// Wait for kernel completion
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "HIP error during kernel execution: %s\n", hipGetErrorString(err));
return;
}
// Copy the result back to host
err = hipMemcpy(output, d_output_cache, 32, hipMemcpyDeviceToHost);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", hipGetErrorString(err));
}
// Memory is kept allocated for reuse - NO hipFree() calls here!
}
// GPU batch processing - the KEY to real GPU performance!
// This processes 1024 different nonces simultaneously (like 1024 CPU threads)
extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_len, uint8_t* output_batch, uint32_t start_nonce, uint32_t batch_size) {
// Ensure we have enough memory for batch processing
const uint32_t max_batch = 1024;
if (batch_size > max_batch) batch_size = max_batch;
// Initialize memory for batch size
static uint8_t *d_input_batch = nullptr;
static uint8_t *d_output_batch = nullptr;
static block *d_memory_batch = nullptr;
static bool batch_memory_initialized = false;
if (!batch_memory_initialized) {
hipError_t err;
// Allocate batch input buffer (1024 × 80 bytes)
err = hipMalloc(&d_input_batch, max_batch * 80);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate batch input: %s\n", hipGetErrorString(err));
return;
}
// Allocate batch output buffer (1024 × 32 bytes)
err = hipMalloc(&d_output_batch, max_batch * 32);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate batch output: %s\n", hipGetErrorString(err));
hipFree(d_input_batch);
return;
}
// Allocate batch Argon2 memory (1024 × 64 blocks)
err = hipMalloc(&d_memory_batch, max_batch * 64 * sizeof(block));
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate batch memory: %s\n", hipGetErrorString(err));
hipFree(d_input_batch);
hipFree(d_output_batch);
return;
}
batch_memory_initialized = true;
printf("RinHashGPU: Batch memory initialized for %d concurrent hashes\n", max_batch);
}
// Prepare batch input data on host
uint8_t* host_batch = (uint8_t*)malloc(batch_size * 80);
for (uint32_t i = 0; i < batch_size; i++) {
memcpy(&host_batch[i * 80], input_template, input_len);
// Set unique nonce for each thread (at position 76-79)
uint32_t nonce = start_nonce + i;
memcpy(&host_batch[i * 80 + 76], &nonce, 4);
}
// Copy batch input to GPU
hipError_t err = hipMemcpy(d_input_batch, host_batch, batch_size * 80, hipMemcpyHostToDevice);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy batch input: %s\n", hipGetErrorString(err));
free(host_batch);
return;
}
// Launch batch kernel - NOW EACH THREAD PROCESSES ONE NONCE!
dim3 blocks((batch_size + 255) / 256); // Enough blocks for all threads
dim3 threads_per_block(256);
rinhash_hip_kernel_batch<<<blocks, threads_per_block>>>(
d_input_batch, input_len, d_output_batch, d_memory_batch, start_nonce, batch_size
);
// Wait for completion
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Batch kernel failed: %s\n", hipGetErrorString(err));
free(host_batch);
return;
}
// Copy results back to host
err = hipMemcpy(output_batch, d_output_batch, batch_size * 32, hipMemcpyDeviceToHost);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy batch output: %s\n", hipGetErrorString(err));
}
free(host_batch);
}
// Cleanup function to free GPU memory cache when miner shuts down
extern "C" void rinhash_hip_cleanup() {
if (gpu_memory_initialized) {
hipFree(d_input_cache);
hipFree(d_output_cache);
hipFree(d_memory_cache);
d_input_cache = nullptr;
d_output_cache = nullptr;
d_memory_cache = nullptr;
gpu_memory_initialized = false;
cached_input_size = 0;
}
}
// Helper function to convert a block header to bytes
extern "C" void blockheader_to_bytes(
const uint32_t* version,
const uint32_t* prev_block,
const uint32_t* merkle_root,
const uint32_t* timestamp,
const uint32_t* bits,
const uint32_t* nonce,
uint8_t* output,
size_t* output_len
) {
size_t offset = 0;
memcpy(output + offset, version, 4); offset += 4;
memcpy(output + offset, prev_block, 32); offset += 32;
memcpy(output + offset, merkle_root, 32); offset += 32;
memcpy(output + offset, timestamp, 4); offset += 4;
memcpy(output + offset, bits, 4); offset += 4;
memcpy(output + offset, nonce, 4); offset += 4;
*output_len = offset;
}