gpu optimizations

Author: Dobromir Popov
Date: 2025-09-06 14:20:19 +03:00
parent a4bc412ca8
commit b475590b61
9 changed files with 491 additions and 210 deletions


@@ -1,5 +1,5 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>


@@ -12,17 +12,52 @@
#include "sha3-256.hip.cu"
#include "blake3_device.cuh"
// Modified kernel to use device functions and write output
// TRUE parallel RinHash kernel - processes multiple nonce values simultaneously
extern "C" __global__ void rinhash_hip_kernel_batch(
const uint8_t* input_batch, // Pre-prepared batch with different nonces
size_t input_len,
uint8_t* output_batch,
block* argon2_memory,
uint32_t start_nonce,
uint32_t batch_size
) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Each thread processes one nonce from the prepared batch
if (tid < batch_size) {
// Get this thread's input (80 bytes per input)
const uint8_t* input = &input_batch[tid * 80];
// Allocate per-thread memory offsets
block* thread_memory = &argon2_memory[tid * 64]; // 64 blocks per thread
uint8_t* thread_output = &output_batch[tid * 32]; // 32 bytes per output
// Step 1: BLAKE3 hash
uint8_t blake3_out[32];
light_hash_device(input, input_len, blake3_out);
// Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1)
uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
uint8_t argon2_out[32];
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, thread_memory, salt, 11);
// Step 3: SHA3-256 hash
sha3_256_device(argon2_out, 32, thread_output);
}
}
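Memory sizing note: with m_cost = 64 blocks per nonce and the 1 KiB Argon2 block size implied by the "(64 KiB)" comments below, each thread consumes 64 × 1 KiB = 64 KiB of Argon2 scratch space, so a full 1024-nonce batch reserves 1024 × 64 KiB = 64 MiB of device memory, which is exactly what the max_batch * 64 * sizeof(block) allocation in rinhash_hip_batch provides.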
// Legacy single-hash kernel for compatibility
extern "C" __global__ void rinhash_hip_kernel(
const uint8_t* input,
size_t input_len,
uint8_t* output,
block* argon2_memory
) {
__shared__ uint8_t blake3_out[32];
__shared__ uint8_t argon2_out[32];
// Only thread 0 performs the sequential RinHash operations
if (threadIdx.x == 0) {
uint8_t blake3_out[32];
uint8_t argon2_out[32];
// Step 1: BLAKE3 hash
light_hash_device(input, input_len, blake3_out);
@@ -31,85 +66,199 @@ extern "C" __global__ void rinhash_hip_kernel(
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11);
// Step 3: SHA3-256 hash
uint8_t sha3_out[32];
sha3_256_device(argon2_out, 32, sha3_out);
// Write result to output
for (int i = 0; i < 32; i++) {
output[i] = sha3_out[i];
}
sha3_256_device(argon2_out, 32, output);
}
__syncthreads();
}
// RinHash HIP implementation for a single header
extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* output) {
// Argon2 parameters
const uint32_t m_cost = 64; // blocks (64 KiB)
// GPU memory cache for performance optimization
static uint8_t *d_input_cache = nullptr;
static uint8_t *d_output_cache = nullptr;
static block *d_memory_cache = nullptr;
static bool gpu_memory_initialized = false;
static size_t cached_input_size = 0;
uint8_t *d_input = nullptr;
uint8_t *d_output = nullptr;
block *d_memory = nullptr;
// Initialize GPU memory once (reused across all hash operations)
static bool init_gpu_memory(size_t input_len) {
if (gpu_memory_initialized && cached_input_size >= input_len) {
return true; // Memory already allocated and sufficient
}
// Clean up old memory if size changed
if (gpu_memory_initialized) {
hipFree(d_input_cache);
hipFree(d_output_cache);
hipFree(d_memory_cache);
}
const uint32_t m_cost = 64; // Argon2 blocks (64 KiB)
hipError_t err;
// Allocate input buffer
err = hipMalloc(&d_input_cache, 80); // Standard block header size
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate input memory cache: %s\n", hipGetErrorString(err));
return false;
}
// Allocate output buffer
err = hipMalloc(&d_output_cache, 32);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate output memory cache: %s\n", hipGetErrorString(err));
hipFree(d_input_cache);
return false;
}
// Allocate minimal Argon2 memory for single-threaded operation
err = hipMalloc(&d_memory_cache, m_cost * sizeof(block));
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate argon2 memory cache: %s\n", hipGetErrorString(err));
hipFree(d_input_cache);
hipFree(d_output_cache);
return false;
}
gpu_memory_initialized = true;
cached_input_size = 80;
return true;
}
// RinHash HIP implementation with memory reuse for optimal performance
extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* output) {
// Initialize GPU memory cache on first call
if (!init_gpu_memory(input_len)) {
fprintf(stderr, "Failed to initialize GPU memory cache\n");
return;
}
hipError_t err;
// Allocate device buffers
err = hipMalloc(&d_input, input_len);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate input memory: %s\n", hipGetErrorString(err));
return;
}
err = hipMalloc(&d_output, 32);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate output memory: %s\n", hipGetErrorString(err));
hipFree(d_input);
return;
}
// Allocate Argon2 memory once per hash
err = hipMalloc(&d_memory, m_cost * sizeof(block));
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", hipGetErrorString(err));
hipFree(d_input);
hipFree(d_output);
return;
}
// Copy input header
err = hipMemcpy(d_input, input, input_len, hipMemcpyHostToDevice);
// Copy input header using cached memory
err = hipMemcpy(d_input_cache, input, input_len, hipMemcpyHostToDevice);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", hipGetErrorString(err));
hipFree(d_memory);
hipFree(d_input);
hipFree(d_output);
return;
}
// Launch the kernel (single thread is fine for single hash)
rinhash_hip_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);
// Launch a minimal kernel: one block of 32 threads. Only thread 0 does the work,
// but a wavefront/warp is the smallest scheduling unit, so this costs no more than a single-thread launch
dim3 blocks(1);
dim3 threads_per_block(32);
rinhash_hip_kernel<<<blocks, threads_per_block>>>(d_input_cache, input_len, d_output_cache, d_memory_cache);
// Wait
// Wait for kernel completion
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "HIP error during kernel execution: %s\n", hipGetErrorString(err));
hipFree(d_memory);
hipFree(d_input);
hipFree(d_output);
return;
}
// Copy result
err = hipMemcpy(output, d_output, 32, hipMemcpyDeviceToHost);
// Copy the result back to host
err = hipMemcpy(output, d_output_cache, 32, hipMemcpyDeviceToHost);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", hipGetErrorString(err));
}
// Free
hipFree(d_memory);
hipFree(d_input);
hipFree(d_output);
// Memory is kept allocated for reuse - NO hipFree() calls here!
}
// GPU batch processing - the KEY to real GPU performance!
// This processes 1024 different nonces simultaneously (like 1024 CPU threads)
extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_len, uint8_t* output_batch, uint32_t start_nonce, uint32_t batch_size) {
// Ensure we have enough memory for batch processing
const uint32_t max_batch = 1024;
if (batch_size > max_batch) batch_size = max_batch;
// Initialize memory for batch size
static uint8_t *d_input_batch = nullptr;
static uint8_t *d_output_batch = nullptr;
static block *d_memory_batch = nullptr;
static bool batch_memory_initialized = false;
if (!batch_memory_initialized) {
hipError_t err;
// Allocate batch input buffer (1024 × 80 bytes)
err = hipMalloc(&d_input_batch, max_batch * 80);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate batch input: %s\n", hipGetErrorString(err));
return;
}
// Allocate batch output buffer (1024 × 32 bytes)
err = hipMalloc(&d_output_batch, max_batch * 32);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate batch output: %s\n", hipGetErrorString(err));
hipFree(d_input_batch);
return;
}
// Allocate batch Argon2 memory (1024 × 64 blocks)
err = hipMalloc(&d_memory_batch, max_batch * 64 * sizeof(block));
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate batch memory: %s\n", hipGetErrorString(err));
hipFree(d_input_batch);
hipFree(d_output_batch);
return;
}
batch_memory_initialized = true;
printf("RinHashGPU: Batch memory initialized for %d concurrent hashes\n", max_batch);
}
// Prepare batch input data on host
uint8_t* host_batch = (uint8_t*)malloc(batch_size * 80);
if (host_batch == nullptr) {
fprintf(stderr, "Error: Failed to allocate host batch staging buffer\n");
return;
}
for (uint32_t i = 0; i < batch_size; i++) {
memcpy(&host_batch[i * 80], input_template, input_len);
// Set unique nonce for each thread (at position 76-79)
uint32_t nonce = start_nonce + i;
memcpy(&host_batch[i * 80 + 76], &nonce, 4);
}
// Copy batch input to GPU
hipError_t err = hipMemcpy(d_input_batch, host_batch, batch_size * 80, hipMemcpyHostToDevice);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy batch input: %s\n", hipGetErrorString(err));
free(host_batch);
return;
}
// Launch batch kernel - NOW EACH THREAD PROCESSES ONE NONCE!
dim3 blocks((batch_size + 255) / 256); // Enough blocks for all threads
dim3 threads_per_block(256);
rinhash_hip_kernel_batch<<<blocks, threads_per_block>>>(
d_input_batch, input_len, d_output_batch, d_memory_batch, start_nonce, batch_size
);
// Wait for completion
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Batch kernel failed: %s\n", hipGetErrorString(err));
free(host_batch);
return;
}
// Copy results back to host
err = hipMemcpy(output_batch, d_output_batch, batch_size * 32, hipMemcpyDeviceToHost);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy batch output: %s\n", hipGetErrorString(err));
}
free(host_batch);
}
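For orientation, a host-side mining loop on top of this batch API could look like the sketch below. Only the rinhash_hip_batch signature is taken from this commit; the scan_nonce_range and meets_target helpers, the hash <= target acceptance test, and the 80-byte header assumption are illustrative additions, not part of the file above.

// Illustrative host-side sketch (not part of this commit): drives rinhash_hip_batch()
// from a mining loop. meets_target() and the loop structure are assumptions.
#include <stdint.h>
#include <string.h>
#include <vector>

extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_len,
                                  uint8_t* output_batch, uint32_t start_nonce,
                                  uint32_t batch_size);

// Byte-wise comparison, treating both arrays as big-endian: the hash qualifies when hash <= target.
static bool meets_target(const uint8_t* hash, const uint8_t* target) {
    return memcmp(hash, target, 32) <= 0;
}

// Scans [start_nonce, start_nonce + range) in GPU batches of up to 1024 nonces.
bool scan_nonce_range(const uint8_t header[80], const uint8_t target[32],
                      uint32_t start_nonce, uint32_t range, uint32_t* found_nonce) {
    const uint32_t max_batch = 1024;              // mirrors max_batch in rinhash_hip_batch
    std::vector<uint8_t> hashes(max_batch * 32);  // 32-byte hash per nonce
    for (uint32_t done = 0; done < range; done += max_batch) {
        uint32_t count = range - done;
        if (count > max_batch) count = max_batch;
        // Hash `count` nonces on the GPU in one launch; results land in `hashes`.
        rinhash_hip_batch(header, 80, hashes.data(), start_nonce + done, count);
        for (uint32_t i = 0; i < count; i++) {
            if (meets_target(&hashes[i * 32], target)) {
                *found_nonce = start_nonce + done + i;
                return true;
            }
        }
    }
    return false;
}

The byte-wise comparison mirrors the lexicographic best-hash selection used by the removed RinHash_mine path further down in this diff.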
// Cleanup function to free GPU memory cache when miner shuts down
extern "C" void rinhash_hip_cleanup() {
if (gpu_memory_initialized) {
hipFree(d_input_cache);
hipFree(d_output_cache);
hipFree(d_memory_cache);
d_input_cache = nullptr;
d_output_cache = nullptr;
d_memory_cache = nullptr;
gpu_memory_initialized = false;
cached_input_size = 0;
}
}
// Helper function to convert a block header to bytes
@@ -134,151 +283,3 @@ extern "C" void blockheader_to_bytes(
*output_len = offset;
}
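The body of blockheader_to_bytes is unchanged by this commit, so only its tail appears above. As a reminder of why the batch path patches the nonce at bytes 76-79, here is a minimal sketch of the conventional 80-byte header serialization the function is assumed to produce; the serialize_header name and the fixed offsets are illustrative.

// Assumed layout only: version | prev_block | merkle_root | timestamp | bits | nonce.
// Offsets match the 80-byte header and the nonce position (bytes 76-79) used above.
#include <stdint.h>
#include <string.h>

static size_t serialize_header(uint8_t out[80],
                               uint32_t version,
                               const uint32_t prev_block[8],   // 32 bytes
                               const uint32_t merkle_root[8],  // 32 bytes
                               uint32_t timestamp,
                               uint32_t bits,
                               uint32_t nonce) {
    size_t offset = 0;
    memcpy(out + offset, &version, 4);      offset += 4;   // bytes 0-3
    memcpy(out + offset, prev_block, 32);   offset += 32;  // bytes 4-35
    memcpy(out + offset, merkle_root, 32);  offset += 32;  // bytes 36-67
    memcpy(out + offset, &timestamp, 4);    offset += 4;   // bytes 68-71
    memcpy(out + offset, &bits, 4);         offset += 4;   // bytes 72-75
    memcpy(out + offset, &nonce, 4);        offset += 4;   // bytes 76-79
    return offset;                                         // 80 bytes total
}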
// Batch processing version for mining (sequential per header for correctness)
extern "C" void rinhash_hip_batch(
const uint8_t* block_headers,
size_t block_header_len,
uint8_t* outputs,
uint32_t num_blocks
) {
// Argon2 parameters
const uint32_t m_cost = 64;
// Allocate reusable device buffers
uint8_t *d_input = nullptr;
uint8_t *d_output = nullptr;
block *d_memory = nullptr;
hipError_t err;
err = hipMalloc(&d_input, block_header_len);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate header buffer: %s\n", hipGetErrorString(err));
return;
}
err = hipMalloc(&d_output, 32);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate output buffer: %s\n", hipGetErrorString(err));
hipFree(d_input);
return;
}
err = hipMalloc(&d_memory, m_cost * sizeof(block));
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", hipGetErrorString(err));
hipFree(d_input);
hipFree(d_output);
return;
}
for (uint32_t i = 0; i < num_blocks; i++) {
const uint8_t* header = block_headers + i * block_header_len;
uint8_t* out = outputs + i * 32;
err = hipMemcpy(d_input, header, block_header_len, hipMemcpyHostToDevice);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: copy header %u failed: %s\n", i, hipGetErrorString(err));
break;
}
rinhash_hip_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "HIP error in kernel %u: %s\n", i, hipGetErrorString(err));
break;
}
err = hipMemcpy(out, d_output, 32, hipMemcpyDeviceToHost);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: copy out %u failed: %s\n", i, hipGetErrorString(err));
break;
}
}
hipFree(d_memory);
hipFree(d_output);
hipFree(d_input);
}
// Main RinHash function that would be called from outside
extern "C" void RinHash(
const uint32_t* version,
const uint32_t* prev_block,
const uint32_t* merkle_root,
const uint32_t* timestamp,
const uint32_t* bits,
const uint32_t* nonce,
uint8_t* output
) {
uint8_t block_header[80];
size_t block_header_len;
blockheader_to_bytes(
version,
prev_block,
merkle_root,
timestamp,
bits,
nonce,
block_header,
&block_header_len
);
rinhash_hip(block_header, block_header_len, output);
}
// Mining function that tries different nonces (host-side best selection)
extern "C" void RinHash_mine(
const uint32_t* version,
const uint32_t* prev_block,
const uint32_t* merkle_root,
const uint32_t* timestamp,
const uint32_t* bits,
uint32_t start_nonce,
uint32_t num_nonces,
uint32_t* found_nonce,
uint8_t* target_hash,
uint8_t* best_hash
) {
const size_t block_header_len = 80;
std::vector<uint8_t> block_headers(block_header_len * num_nonces);
std::vector<uint8_t> hashes(32 * num_nonces);
for (uint32_t i = 0; i < num_nonces; i++) {
uint32_t current_nonce = start_nonce + i;
uint8_t* header = block_headers.data() + i * block_header_len;
size_t header_len;
blockheader_to_bytes(
version,
prev_block,
merkle_root,
timestamp,
bits,
&current_nonce,
header,
&header_len
);
}
rinhash_hip_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
memcpy(best_hash, hashes.data(), 32);
*found_nonce = start_nonce;
for (uint32_t i = 1; i < num_nonces; i++) {
uint8_t* current_hash = hashes.data() + i * 32;
bool is_better = false;
for (int j = 0; j < 32; j++) {
if (current_hash[j] < best_hash[j]) { is_better = true; break; }
else if (current_hash[j] > best_hash[j]) { break; }
}
if (is_better) {
memcpy(best_hash, current_hash, 32);
*found_nonce = start_nonce + i;
}
}
}