Files
mines/rin/miner/gpu/RinHash-hip/rinhash.hip.cu
Dobromir Popov 00cda24e71 wip
2025-09-06 20:55:10 +03:00

328 lines
12 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <vector>
#include <stdexcept>
// Include shared device functions
#include "rinhash_device.cuh"
#include "argon2d_device.cuh"
#include "sha3-256.hip.cu"
#include "blake3_device.cuh"
// TRUE parallel RinHash kernel - processes multiple nonce values simultaneously
extern "C" __global__ void rinhash_hip_kernel_batch(
const uint8_t* input_batch, // Pre-prepared batch with different nonces
size_t input_len,
uint8_t* output_batch,
block* argon2_memory,
uint32_t start_nonce,
uint32_t batch_size
) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Each thread processes one nonce from the prepared batch
if (tid < batch_size) {
// Get this thread's input (80 bytes per input)
const uint8_t* input = &input_batch[tid * 80];
// Allocate per-thread memory offsets
block* thread_memory = &argon2_memory[tid * 64]; // 64 blocks per thread
uint8_t* thread_output = &output_batch[tid * 32]; // 32 bytes per output
// Step 1: BLAKE3 hash
uint8_t blake3_out[32];
light_hash_device(input, input_len, blake3_out);
// Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1)
uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
uint8_t argon2_out[32];
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, thread_memory, salt, 11);
// Step 3: SHA3-256 hash
sha3_256_device(argon2_out, 32, thread_output);
}
}
// Legacy single-hash kernel for compatibility
extern "C" __global__ void rinhash_hip_kernel(
const uint8_t* input,
size_t input_len,
uint8_t* output,
block* argon2_memory
) {
// Only thread 0 performs the sequential RinHash operations
if (threadIdx.x == 0) {
uint8_t blake3_out[32];
uint8_t argon2_out[32];
// Step 1: BLAKE3 hash
light_hash_device(input, input_len, blake3_out);
// Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1)
uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11);
// Step 3: SHA3-256 hash
sha3_256_device(argon2_out, 32, output);
}
__syncthreads();
}
// GPU memory cache for performance optimization
static uint8_t *d_input_cache = nullptr;
static uint8_t *d_output_cache = nullptr;
static block *d_memory_cache = nullptr;
static bool gpu_memory_initialized = false;
static size_t cached_input_size = 0;
// Initialize GPU memory once (reused across all hash operations)
static bool init_gpu_memory(size_t input_len) {
if (gpu_memory_initialized && cached_input_size >= input_len) {
return true; // Memory already allocated and sufficient
}
// Clean up old memory if size changed
if (gpu_memory_initialized) {
hipFree(d_input_cache);
hipFree(d_output_cache);
hipFree(d_memory_cache);
}
const uint32_t m_cost = 64; // Argon2 blocks (64 KiB)
hipError_t err;
// Allocate input buffer
err = hipMalloc(&d_input_cache, 80); // Standard block header size
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate input memory cache: %s\n", hipGetErrorString(err));
return false;
}
// Allocate output buffer
err = hipMalloc(&d_output_cache, 32);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate output memory cache: %s\n", hipGetErrorString(err));
hipFree(d_input_cache);
return false;
}
// Allocate minimal Argon2 memory for single-threaded operation
err = hipMalloc(&d_memory_cache, m_cost * sizeof(block));
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate argon2 memory cache: %s\n", hipGetErrorString(err));
hipFree(d_input_cache);
hipFree(d_output_cache);
return false;
}
gpu_memory_initialized = true;
cached_input_size = 80;
return true;
}
// RinHash HIP implementation with memory reuse for optimal performance
extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* output) {
// Initialize GPU memory cache on first call
if (!init_gpu_memory(input_len)) {
fprintf(stderr, "Failed to initialize GPU memory cache\n");
return;
}
hipError_t err;
// Copy input header using cached memory
err = hipMemcpy(d_input_cache, input, input_len, hipMemcpyHostToDevice);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", hipGetErrorString(err));
return;
}
// Launch minimal kernel - single block with 32 threads for optimal latency
// This reduces kernel launch overhead while maintaining GPU acceleration
dim3 blocks(1);
dim3 threads_per_block(32);
rinhash_hip_kernel<<<blocks, threads_per_block>>>(d_input_cache, input_len, d_output_cache, d_memory_cache);
// Wait for kernel completion
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "HIP error during kernel execution: %s\n", hipGetErrorString(err));
return;
}
// Copy the result back to host
err = hipMemcpy(output, d_output_cache, 32, hipMemcpyDeviceToHost);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", hipGetErrorString(err));
}
// Memory is kept allocated for reuse - NO hipFree() calls here!
}
// PERSISTENT GPU MEMORY - Allocate once, reuse forever! (MASSIVE PERFORMANCE BOOST)
static uint8_t *d_input_persistent = nullptr;
static uint8_t *d_output_persistent = nullptr;
static block *d_memory_persistent = nullptr;
static uint32_t persistent_max_batch = 0;
static bool persistent_memory_initialized = false;
// HIGH-PERFORMANCE batch processing with PERSISTENT memory reuse
extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_len, uint8_t* output_batch, uint32_t start_nonce, uint32_t batch_size) {
hipError_t err;
// SMART MEMORY MANAGEMENT: Only reallocate if we need MORE memory
if (!persistent_memory_initialized || batch_size > persistent_max_batch) {
// Free old memory if we're expanding
if (persistent_memory_initialized) {
printf("RinHashGPU: Expanding memory from %u to %u nonces\n", persistent_max_batch, batch_size);
hipFree(d_input_persistent);
hipFree(d_output_persistent);
hipFree(d_memory_persistent);
}
// Allocate with some HEADROOM for future batches (reduce reallocations)
persistent_max_batch = batch_size * 2; // 2x headroom for growth
const size_t input_size = persistent_max_batch * 80;
const size_t output_size = persistent_max_batch * 32;
const size_t memory_size = persistent_max_batch * 64 * sizeof(block);
printf("RinHashGPU: PERSISTENT ALLOCATION: %zu MB input + %zu MB output + %zu MB Argon2 = %zu MB total (capacity: %u nonces)\n",
input_size / (1024*1024), output_size / (1024*1024), memory_size / (1024*1024),
(input_size + output_size + memory_size) / (1024*1024), persistent_max_batch);
// Allocate PERSISTENT buffers with headroom
err = hipMalloc(&d_input_persistent, input_size);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate persistent input (%zu MB): %s\n", input_size / (1024*1024), hipGetErrorString(err));
persistent_memory_initialized = false;
return;
}
err = hipMalloc(&d_output_persistent, output_size);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate persistent output (%zu MB): %s\n", output_size / (1024*1024), hipGetErrorString(err));
hipFree(d_input_persistent);
persistent_memory_initialized = false;
return;
}
err = hipMalloc(&d_memory_persistent, memory_size);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate persistent Argon2 memory (%zu MB): %s\n", memory_size / (1024*1024), hipGetErrorString(err));
hipFree(d_input_persistent);
hipFree(d_output_persistent);
persistent_memory_initialized = false;
return;
}
persistent_memory_initialized = true;
printf("RinHashGPU: PERSISTENT MEMORY initialized - NO MORE ALLOCATIONS until expansion needed!\n");
}
// Prepare batch input data on host
uint8_t* host_batch = (uint8_t*)malloc(batch_size * 80);
for (uint32_t i = 0; i < batch_size; i++) {
memcpy(&host_batch[i * 80], input_template, input_len);
// Set unique nonce for each thread (at position 76-79)
uint32_t nonce = start_nonce + i;
memcpy(&host_batch[i * 80 + 76], &nonce, 4);
}
// ULTRA-FAST memory transfer using persistent buffers (NO ALLOCATION OVERHEAD)
err = hipMemcpyAsync(d_input_persistent, host_batch, batch_size * 80, hipMemcpyHostToDevice, 0);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy batch input: %s\n", hipGetErrorString(err));
free(host_batch);
return;
}
// Launch DYNAMIC INDEPENDENT MINING kernel - Each thread = independent miner!
const uint32_t miners_per_block = 1024; // 1024 independent miners per block
const uint32_t total_blocks = (batch_size + miners_per_block - 1) / miners_per_block;
dim3 blocks(total_blocks);
dim3 threads_per_block(miners_per_block);
printf("RinHashGPU: Launching %u blocks × %u threads = %u independent miners processing %u nonces\n",
total_blocks, miners_per_block, total_blocks * miners_per_block, batch_size);
rinhash_hip_kernel_batch<<<blocks, threads_per_block>>>(
d_input_persistent, input_len, d_output_persistent, d_memory_persistent, start_nonce, batch_size
);
// Wait for completion
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Batch kernel failed: %s\n", hipGetErrorString(err));
free(host_batch);
return;
}
// BLAZING-FAST result transfer using persistent output buffer
err = hipMemcpyAsync(output_batch, d_output_persistent, batch_size * 32, hipMemcpyDeviceToHost, 0);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy batch output: %s\n", hipGetErrorString(err));
}
// Synchronize for completion (no GPU memory cleanup - PERSISTENT REUSE!)
hipDeviceSynchronize();
// Only free HOST memory (GPU memory stays allocated for maximum performance)
free(host_batch);
}
// Cleanup function to free GPU memory when miner shuts down
extern "C" void rinhash_hip_cleanup() {
// Clean up old cache system
if (gpu_memory_initialized) {
hipFree(d_input_cache);
hipFree(d_output_cache);
hipFree(d_memory_cache);
d_input_cache = nullptr;
d_output_cache = nullptr;
d_memory_cache = nullptr;
gpu_memory_initialized = false;
cached_input_size = 0;
}
// Clean up new persistent system
if (persistent_memory_initialized) {
printf("RinHashGPU: Cleaning up persistent memory on shutdown\n");
hipFree(d_input_persistent);
hipFree(d_output_persistent);
hipFree(d_memory_persistent);
d_input_persistent = nullptr;
d_output_persistent = nullptr;
d_memory_persistent = nullptr;
persistent_memory_initialized = false;
persistent_max_batch = 0;
}
}
// Helper function to convert a block header to bytes
extern "C" void blockheader_to_bytes(
const uint32_t* version,
const uint32_t* prev_block,
const uint32_t* merkle_root,
const uint32_t* timestamp,
const uint32_t* bits,
const uint32_t* nonce,
uint8_t* output,
size_t* output_len
) {
size_t offset = 0;
memcpy(output + offset, version, 4); offset += 4;
memcpy(output + offset, prev_block, 32); offset += 32;
memcpy(output + offset, merkle_root, 32); offset += 32;
memcpy(output + offset, timestamp, 4); offset += 4;
memcpy(output + offset, bits, 4); offset += 4;
memcpy(output + offset, nonce, 4); offset += 4;
*output_len = offset;
}