#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <stdexcept>
#include <vector>

// Include shared device functions
#include "rinhash_device.cuh"
#include "argon2d_device.cuh"
#include "sha3-256.hip.cu"
#include "blake3_device.cuh"
// TRUE parallel RinHash kernel - processes multiple nonce values simultaneously.
//
// Launch layout: 1D grid, one thread per nonce. Threads with tid >= batch_size
// do nothing, so any grid that covers batch_size is valid.
// Preconditions (established by the host wrapper rinhash_hip_batch):
//   - input_batch holds batch_size slots of exactly 80 bytes each, with the
//     per-slot nonce already written at bytes 76..79 on the host.
//   - output_batch has room for batch_size * 32 bytes.
//   - argon2_memory has room for batch_size * 64 blocks (64 KiB per thread).
extern "C" __global__ void rinhash_hip_kernel_batch(
    const uint8_t* input_batch, // Pre-prepared batch with different nonces
    size_t input_len,
    uint8_t* output_batch,
    block* argon2_memory,
    uint32_t start_nonce,
    uint32_t batch_size
) {
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    (void)start_nonce; // nonces are pre-baked into input_batch by the host

    // Each thread processes one nonce from the prepared batch
    if (tid < batch_size) {
        // size_t arithmetic: tid * 80 in 32-bit int would overflow for very
        // large batches (buffers are sized with size_t math on the host).
        const uint8_t* input = &input_batch[(size_t)tid * 80];

        // Per-thread slices of the shared scratch and output buffers
        block* thread_memory = &argon2_memory[(size_t)tid * 64]; // 64 blocks per thread
        uint8_t* thread_output = &output_batch[(size_t)tid * 32]; // 32 bytes per output

        // Step 1: BLAKE3 hash of the 80-byte header
        uint8_t blake3_out[32];
        light_hash_device(input, input_len, blake3_out);

        // Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1)
        uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
        uint8_t argon2_out[32];
        device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, thread_memory, salt, 11);

        // Step 3: SHA3-256 produces the final 32-byte RinHash digest
        sha3_256_device(argon2_out, 32, thread_output);
    }
}
// Legacy single-hash kernel for compatibility.
//
// Intended to be launched as a single block; only thread 0 performs the
// (inherently sequential) BLAKE3 -> Argon2d -> SHA3-256 pipeline, the other
// threads simply wait at the trailing barrier.
extern "C" __global__ void rinhash_hip_kernel(
    const uint8_t* input,
    size_t input_len,
    uint8_t* output,
    block* argon2_memory
) {
    // Only thread 0 performs the sequential RinHash operations
    if (threadIdx.x == 0) {
        uint8_t stage1_digest[32]; // BLAKE3 output
        uint8_t stage2_digest[32]; // Argon2d output

        // Step 1: BLAKE3 hash
        light_hash_device(input, input_len, stage1_digest);

        // Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1)
        uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
        device_argon2d_hash(stage2_digest, stage1_digest, 32, 2, 64, 1, argon2_memory, salt, 11);

        // Step 3: SHA3-256 hash into the caller's output buffer
        sha3_256_device(stage2_digest, 32, output);
    }
    // Barrier is outside the divergent branch, so every thread reaches it.
    __syncthreads();
}
// GPU memory cache for performance optimization
static uint8_t *d_input_cache = nullptr;
static uint8_t *d_output_cache = nullptr;
static block *d_memory_cache = nullptr;
static bool gpu_memory_initialized = false;
static size_t cached_input_size = 0;

// Initialize GPU memory once (reused across all hash operations).
//
// Returns true when d_input_cache / d_output_cache / d_memory_cache are valid
// and the input buffer holds at least input_len bytes. Returns false on
// allocation failure, in which case all cache pointers are null and the cache
// is marked uninitialized (so a later call can retry cleanly).
static bool init_gpu_memory(size_t input_len) {
    if (gpu_memory_initialized && cached_input_size >= input_len) {
        return true; // Memory already allocated and sufficient
    }

    // Clean up old memory if size changed. Mark the cache uninitialized and
    // null the pointers immediately, so a failure below can never leave
    // dangling pointers that still look valid.
    if (gpu_memory_initialized) {
        hipFree(d_input_cache);
        hipFree(d_output_cache);
        hipFree(d_memory_cache);
        d_input_cache = nullptr;
        d_output_cache = nullptr;
        d_memory_cache = nullptr;
        gpu_memory_initialized = false;
        cached_input_size = 0;
    }

    const uint32_t m_cost = 64; // Argon2 blocks (64 KiB)
    // Standard block header size is 80 bytes, but honor larger inputs so the
    // caller's hipMemcpy of input_len bytes can never overflow this buffer.
    const size_t input_alloc = (input_len > 80) ? input_len : 80;
    hipError_t err;

    // Allocate input buffer
    err = hipMalloc(&d_input_cache, input_alloc);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to allocate input memory cache: %s\n", hipGetErrorString(err));
        d_input_cache = nullptr;
        return false;
    }

    // Allocate output buffer (32-byte RinHash digest)
    err = hipMalloc(&d_output_cache, 32);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to allocate output memory cache: %s\n", hipGetErrorString(err));
        hipFree(d_input_cache);
        d_input_cache = nullptr;
        d_output_cache = nullptr;
        return false;
    }

    // Allocate minimal Argon2 memory for single-threaded operation
    err = hipMalloc(&d_memory_cache, m_cost * sizeof(block));
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to allocate argon2 memory cache: %s\n", hipGetErrorString(err));
        hipFree(d_input_cache);
        hipFree(d_output_cache);
        d_input_cache = nullptr;
        d_output_cache = nullptr;
        d_memory_cache = nullptr;
        return false;
    }

    gpu_memory_initialized = true;
    // Record the true allocation size so the early-out check above stays
    // consistent with what was actually allocated.
    cached_input_size = input_alloc;
    return true;
}
// RinHash HIP implementation with memory reuse for optimal performance.
//
// Computes the 32-byte RinHash digest of input[0..input_len) into output.
// On any HIP failure an error is logged and output is left unmodified.
// Device buffers are cached across calls; see rinhash_hip_cleanup().
extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* output) {
    // Initialize GPU memory cache on first call
    if (!init_gpu_memory(input_len)) {
        fprintf(stderr, "Failed to initialize GPU memory cache\n");
        return;
    }

    hipError_t err;

    // Copy input header using cached memory
    err = hipMemcpy(d_input_cache, input, input_len, hipMemcpyHostToDevice);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", hipGetErrorString(err));
        return;
    }

    // Launch minimal kernel - single block with 32 threads for optimal latency
    // This reduces kernel launch overhead while maintaining GPU acceleration
    dim3 blocks(1);
    dim3 threads_per_block(32);
    rinhash_hip_kernel<<<blocks, threads_per_block>>>(d_input_cache, input_len, d_output_cache, d_memory_cache);

    // Kernel launches return no status; catch launch-configuration errors here
    err = hipGetLastError();
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to launch kernel: %s\n", hipGetErrorString(err));
        return;
    }

    // Wait for kernel completion
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error during kernel execution: %s\n", hipGetErrorString(err));
        return;
    }

    // Copy the result back to host
    err = hipMemcpy(output, d_output_cache, 32, hipMemcpyDeviceToHost);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", hipGetErrorString(err));
    }

    // Memory is kept allocated for reuse - NO hipFree() calls here!
}
// PERSISTENT GPU MEMORY - Allocate once, reuse forever! (MASSIVE PERFORMANCE BOOST)
static uint8_t *d_input_persistent = nullptr;
static uint8_t *d_output_persistent = nullptr;
static block *d_memory_persistent = nullptr;
static uint32_t persistent_max_batch = 0;
static bool persistent_memory_initialized = false;

// HIGH-PERFORMANCE batch processing with PERSISTENT memory reuse.
//
// Hashes batch_size variants of the input_template header: variant i gets
// nonce (start_nonce + i) written at bytes 76..79 of its 80-byte slot, and
// its 32-byte digest is stored at output_batch + i*32. Errors are logged and
// the function returns early, leaving output_batch unmodified.
// Requires input_len <= 80 (the fixed per-nonce slot size).
extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_len, uint8_t* output_batch, uint32_t start_nonce, uint32_t batch_size) {
    hipError_t err;

    if (batch_size == 0) {
        return; // nothing to hash
    }
    // Each slot is exactly 80 bytes with the nonce at offset 76; a longer
    // template would overflow the slot during the fill loop below.
    if (input_len > 80) {
        fprintf(stderr, "HIP error: header template (%zu bytes) exceeds 80-byte nonce slot\n", input_len);
        return;
    }

    // SMART MEMORY MANAGEMENT: Only reallocate if we need MORE memory
    if (!persistent_memory_initialized || batch_size > persistent_max_batch) {
        // Free old memory if we're expanding; null the pointers and drop the
        // initialized flag right away so a failure below cannot leave
        // dangling pointers that still look valid.
        if (persistent_memory_initialized) {
            printf("RinHashGPU: Expanding memory from %u to %u nonces\n", persistent_max_batch, batch_size);
            hipFree(d_input_persistent);
            hipFree(d_output_persistent);
            hipFree(d_memory_persistent);
            d_input_persistent = nullptr;
            d_output_persistent = nullptr;
            d_memory_persistent = nullptr;
            persistent_memory_initialized = false;
        }

        // Allocate with some HEADROOM for future batches (reduce reallocations)
        persistent_max_batch = batch_size * 2; // 2x headroom for growth

        // size_t promotion before multiplying so large capacities cannot
        // overflow 32-bit arithmetic.
        const size_t input_size = (size_t)persistent_max_batch * 80;
        const size_t output_size = (size_t)persistent_max_batch * 32;
        const size_t memory_size = (size_t)persistent_max_batch * 64 * sizeof(block);

        printf("RinHashGPU: PERSISTENT ALLOCATION: %zu MB input + %zu MB output + %zu MB Argon2 = %zu MB total (capacity: %u nonces)\n",
               input_size / (1024*1024), output_size / (1024*1024), memory_size / (1024*1024),
               (input_size + output_size + memory_size) / (1024*1024), persistent_max_batch);

        // Allocate PERSISTENT buffers with headroom. On any failure, null all
        // pointers and zero the capacity so the next call retries from scratch.
        err = hipMalloc(&d_input_persistent, input_size);
        if (err != hipSuccess) {
            fprintf(stderr, "HIP error: Failed to allocate persistent input (%zu MB): %s\n", input_size / (1024*1024), hipGetErrorString(err));
            d_input_persistent = nullptr;
            persistent_max_batch = 0;
            persistent_memory_initialized = false;
            return;
        }

        err = hipMalloc(&d_output_persistent, output_size);
        if (err != hipSuccess) {
            fprintf(stderr, "HIP error: Failed to allocate persistent output (%zu MB): %s\n", output_size / (1024*1024), hipGetErrorString(err));
            hipFree(d_input_persistent);
            d_input_persistent = nullptr;
            d_output_persistent = nullptr;
            persistent_max_batch = 0;
            persistent_memory_initialized = false;
            return;
        }

        err = hipMalloc(&d_memory_persistent, memory_size);
        if (err != hipSuccess) {
            fprintf(stderr, "HIP error: Failed to allocate persistent Argon2 memory (%zu MB): %s\n", memory_size / (1024*1024), hipGetErrorString(err));
            hipFree(d_input_persistent);
            hipFree(d_output_persistent);
            d_input_persistent = nullptr;
            d_output_persistent = nullptr;
            d_memory_persistent = nullptr;
            persistent_max_batch = 0;
            persistent_memory_initialized = false;
            return;
        }

        persistent_memory_initialized = true;
        printf("RinHashGPU: PERSISTENT MEMORY initialized - NO MORE ALLOCATIONS until expansion needed!\n");
    }

    // Prepare batch input data on host. calloc zero-fills the slack bytes of
    // each 80-byte slot when input_len < 80, keeping the hashed input
    // deterministic (malloc would leave them uninitialized).
    uint8_t* host_batch = (uint8_t*)calloc(batch_size, 80);
    if (host_batch == nullptr) {
        fprintf(stderr, "HIP error: Failed to allocate host batch buffer\n");
        return;
    }
    for (uint32_t i = 0; i < batch_size; i++) {
        memcpy(&host_batch[(size_t)i * 80], input_template, input_len);
        // Set unique nonce for each thread (at position 76-79)
        uint32_t nonce = start_nonce + i;
        memcpy(&host_batch[(size_t)i * 80 + 76], &nonce, 4);
    }

    // ULTRA-FAST memory transfer using persistent buffers (NO ALLOCATION OVERHEAD)
    err = hipMemcpyAsync(d_input_persistent, host_batch, (size_t)batch_size * 80, hipMemcpyHostToDevice, 0);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to copy batch input: %s\n", hipGetErrorString(err));
        free(host_batch);
        return;
    }

    // Launch DYNAMIC INDEPENDENT MINING kernel - Each thread = independent miner!
    const uint32_t miners_per_block = 1024; // 1024 independent miners per block
    const uint32_t total_blocks = (batch_size + miners_per_block - 1) / miners_per_block;

    dim3 blocks(total_blocks);
    dim3 threads_per_block(miners_per_block);

    printf("RinHashGPU: Launching %u blocks × %u threads = %u independent miners processing %u nonces\n",
           total_blocks, miners_per_block, total_blocks * miners_per_block, batch_size);

    rinhash_hip_kernel_batch<<<blocks, threads_per_block>>>(
        d_input_persistent, input_len, d_output_persistent, d_memory_persistent, start_nonce, batch_size
    );

    // Kernel launches return no status; catch launch-configuration errors here
    err = hipGetLastError();
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to launch batch kernel: %s\n", hipGetErrorString(err));
        free(host_batch);
        return;
    }

    // Wait for completion
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Batch kernel failed: %s\n", hipGetErrorString(err));
        free(host_batch);
        return;
    }

    // BLAZING-FAST result transfer using persistent output buffer
    err = hipMemcpyAsync(output_batch, d_output_persistent, (size_t)batch_size * 32, hipMemcpyDeviceToHost, 0);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to copy batch output: %s\n", hipGetErrorString(err));
    }

    // Synchronize so the async copy into the (pageable) output_batch has
    // finished before the caller reads it (no GPU cleanup - PERSISTENT REUSE!)
    hipDeviceSynchronize();

    // Only free HOST memory (GPU memory stays allocated for maximum performance)
    free(host_batch);
}
// Cleanup function to free GPU memory when miner shuts down.
// Safe to call multiple times; each cache is torn down at most once.
extern "C" void rinhash_hip_cleanup() {
    // Free one device buffer and null the owning pointer.
    auto release = [](auto*& device_ptr) {
        hipFree(device_ptr);
        device_ptr = nullptr;
    };

    // Tear down the legacy single-hash cache, if it was ever created.
    if (gpu_memory_initialized) {
        release(d_input_cache);
        release(d_output_cache);
        release(d_memory_cache);
        gpu_memory_initialized = false;
        cached_input_size = 0;
    }

    // Tear down the persistent batch buffers, if allocated.
    if (persistent_memory_initialized) {
        printf("RinHashGPU: Cleaning up persistent memory on shutdown\n");
        release(d_input_persistent);
        release(d_output_persistent);
        release(d_memory_persistent);
        persistent_memory_initialized = false;
        persistent_max_batch = 0;
    }
}
// Helper function to convert a block header to bytes.
//
// Serializes the six header fields into `output` in protocol order
// (version | prev_block | merkle_root | timestamp | bits | nonce) and
// reports the total serialized length (always 80 bytes) via output_len.
// `output` must have room for 80 bytes.
extern "C" void blockheader_to_bytes(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    const uint32_t* nonce,
    uint8_t* output,
    size_t* output_len
) {
    // Field table keeps the wire layout in one place: {source, byte count}.
    struct Field { const void* src; size_t len; };
    const Field fields[] = {
        { version,     4  },
        { prev_block,  32 },
        { merkle_root, 32 },
        { timestamp,   4  },
        { bits,        4  },
        { nonce,       4  },
    };

    size_t cursor = 0;
    for (const Field& f : fields) {
        memcpy(output + cursor, f.src, f.len);
        cursor += f.len;
    }

    *output_len = cursor;
}