#include #include #include #include #include #include #include // Include shared device functions #include "rinhash_device.cuh" #include "argon2d_device.cuh" #include "sha3-256.hip.cu" #include "blake3_device.cuh" // TRUE parallel RinHash kernel - processes multiple nonce values simultaneously extern "C" __global__ void rinhash_hip_kernel_batch( const uint8_t* input_batch, // Pre-prepared batch with different nonces size_t input_len, uint8_t* output_batch, block* argon2_memory, uint32_t start_nonce, uint32_t batch_size ) { int tid = blockIdx.x * blockDim.x + threadIdx.x; // Each thread processes one nonce from the prepared batch if (tid < batch_size) { // Get this thread's input (80 bytes per input) const uint8_t* input = &input_batch[tid * 80]; // Allocate per-thread memory offsets block* thread_memory = &argon2_memory[tid * 64]; // 64 blocks per thread uint8_t* thread_output = &output_batch[tid * 32]; // 32 bytes per output // Step 1: BLAKE3 hash uint8_t blake3_out[32]; light_hash_device(input, input_len, blake3_out); // Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1) uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' }; uint8_t argon2_out[32]; device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, thread_memory, salt, 11); // Step 3: SHA3-256 hash sha3_256_device(argon2_out, 32, thread_output); } } // Legacy single-hash kernel for compatibility extern "C" __global__ void rinhash_hip_kernel( const uint8_t* input, size_t input_len, uint8_t* output, block* argon2_memory ) { // Only thread 0 performs the sequential RinHash operations if (threadIdx.x == 0) { uint8_t blake3_out[32]; uint8_t argon2_out[32]; // Step 1: BLAKE3 hash light_hash_device(input, input_len, blake3_out); // Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1) uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' }; device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11); // Step 3: SHA3-256 hash sha3_256_device(argon2_out, 32, output); } __syncthreads(); } // GPU memory cache for performance optimization static uint8_t *d_input_cache = nullptr; static uint8_t *d_output_cache = nullptr; static block *d_memory_cache = nullptr; static bool gpu_memory_initialized = false; static size_t cached_input_size = 0; // Initialize GPU memory once (reused across all hash operations) static bool init_gpu_memory(size_t input_len) { if (gpu_memory_initialized && cached_input_size >= input_len) { return true; // Memory already allocated and sufficient } // Clean up old memory if size changed if (gpu_memory_initialized) { hipFree(d_input_cache); hipFree(d_output_cache); hipFree(d_memory_cache); } const uint32_t m_cost = 64; // Argon2 blocks (64 KiB) hipError_t err; // Allocate input buffer err = hipMalloc(&d_input_cache, 80); // Standard block header size if (err != hipSuccess) { fprintf(stderr, "HIP error: Failed to allocate input memory cache: %s\n", hipGetErrorString(err)); return false; } // Allocate output buffer err = hipMalloc(&d_output_cache, 32); if (err != hipSuccess) { fprintf(stderr, "HIP error: Failed to allocate output memory cache: %s\n", hipGetErrorString(err)); hipFree(d_input_cache); return false; } // Allocate minimal Argon2 memory for single-threaded operation err = hipMalloc(&d_memory_cache, m_cost * sizeof(block)); if (err != hipSuccess) { fprintf(stderr, "HIP error: Failed to allocate argon2 memory cache: %s\n", hipGetErrorString(err)); hipFree(d_input_cache); hipFree(d_output_cache); return false; } gpu_memory_initialized = true; cached_input_size = 80; return true; } // RinHash HIP implementation with memory reuse for optimal performance extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* output) { // Initialize GPU memory cache on first call if (!init_gpu_memory(input_len)) { fprintf(stderr, "Failed to initialize GPU memory cache\n"); return; } hipError_t err; // Copy input header using cached memory err = hipMemcpy(d_input_cache, input, input_len, hipMemcpyHostToDevice); if (err != hipSuccess) { fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", hipGetErrorString(err)); return; } // Launch minimal kernel - single block with 32 threads for optimal latency // This reduces kernel launch overhead while maintaining GPU acceleration dim3 blocks(1); dim3 threads_per_block(32); rinhash_hip_kernel<<>>(d_input_cache, input_len, d_output_cache, d_memory_cache); // Wait for kernel completion err = hipDeviceSynchronize(); if (err != hipSuccess) { fprintf(stderr, "HIP error during kernel execution: %s\n", hipGetErrorString(err)); return; } // Copy the result back to host err = hipMemcpy(output, d_output_cache, 32, hipMemcpyDeviceToHost); if (err != hipSuccess) { fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", hipGetErrorString(err)); } // Memory is kept allocated for reuse - NO hipFree() calls here! } // PERSISTENT GPU MEMORY - Allocate once, reuse forever! (MASSIVE PERFORMANCE BOOST) static uint8_t *d_input_persistent = nullptr; static uint8_t *d_output_persistent = nullptr; static block *d_memory_persistent = nullptr; static uint32_t persistent_max_batch = 0; static bool persistent_memory_initialized = false; // HIGH-PERFORMANCE batch processing with PERSISTENT memory reuse extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_len, uint8_t* output_batch, uint32_t start_nonce, uint32_t batch_size) { hipError_t err; // SMART MEMORY MANAGEMENT: Only reallocate if we need MORE memory if (!persistent_memory_initialized || batch_size > persistent_max_batch) { // Free old memory if we're expanding if (persistent_memory_initialized) { printf("RinHashGPU: Expanding memory from %u to %u nonces\n", persistent_max_batch, batch_size); hipFree(d_input_persistent); hipFree(d_output_persistent); hipFree(d_memory_persistent); } // Allocate with some HEADROOM for future batches (reduce reallocations) persistent_max_batch = batch_size * 2; // 2x headroom for growth const size_t input_size = persistent_max_batch * 80; const size_t output_size = persistent_max_batch * 32; const size_t memory_size = persistent_max_batch * 64 * sizeof(block); printf("RinHashGPU: PERSISTENT ALLOCATION: %zu MB input + %zu MB output + %zu MB Argon2 = %zu MB total (capacity: %u nonces)\n", input_size / (1024*1024), output_size / (1024*1024), memory_size / (1024*1024), (input_size + output_size + memory_size) / (1024*1024), persistent_max_batch); // Allocate PERSISTENT buffers with headroom err = hipMalloc(&d_input_persistent, input_size); if (err != hipSuccess) { fprintf(stderr, "HIP error: Failed to allocate persistent input (%zu MB): %s\n", input_size / (1024*1024), hipGetErrorString(err)); persistent_memory_initialized = false; return; } err = hipMalloc(&d_output_persistent, output_size); if (err != hipSuccess) { fprintf(stderr, "HIP error: Failed to allocate persistent output (%zu MB): %s\n", output_size / (1024*1024), hipGetErrorString(err)); hipFree(d_input_persistent); persistent_memory_initialized = false; return; } err = hipMalloc(&d_memory_persistent, memory_size); if (err != hipSuccess) { fprintf(stderr, "HIP error: Failed to allocate persistent Argon2 memory (%zu MB): %s\n", memory_size / (1024*1024), hipGetErrorString(err)); hipFree(d_input_persistent); hipFree(d_output_persistent); persistent_memory_initialized = false; return; } persistent_memory_initialized = true; printf("RinHashGPU: PERSISTENT MEMORY initialized - NO MORE ALLOCATIONS until expansion needed!\n"); } // Prepare batch input data on host uint8_t* host_batch = (uint8_t*)malloc(batch_size * 80); for (uint32_t i = 0; i < batch_size; i++) { memcpy(&host_batch[i * 80], input_template, input_len); // Set unique nonce for each thread (at position 76-79) uint32_t nonce = start_nonce + i; memcpy(&host_batch[i * 80 + 76], &nonce, 4); } // ULTRA-FAST memory transfer using persistent buffers (NO ALLOCATION OVERHEAD) err = hipMemcpyAsync(d_input_persistent, host_batch, batch_size * 80, hipMemcpyHostToDevice, 0); if (err != hipSuccess) { fprintf(stderr, "HIP error: Failed to copy batch input: %s\n", hipGetErrorString(err)); free(host_batch); return; } // Launch DYNAMIC INDEPENDENT MINING kernel - Each thread = independent miner! const uint32_t miners_per_block = 1024; // 1024 independent miners per block const uint32_t total_blocks = (batch_size + miners_per_block - 1) / miners_per_block; dim3 blocks(total_blocks); dim3 threads_per_block(miners_per_block); printf("RinHashGPU: Launching %u blocks × %u threads = %u independent miners processing %u nonces\n", total_blocks, miners_per_block, total_blocks * miners_per_block, batch_size); rinhash_hip_kernel_batch<<>>( d_input_persistent, input_len, d_output_persistent, d_memory_persistent, start_nonce, batch_size ); // Wait for completion err = hipDeviceSynchronize(); if (err != hipSuccess) { fprintf(stderr, "HIP error: Batch kernel failed: %s\n", hipGetErrorString(err)); free(host_batch); return; } // BLAZING-FAST result transfer using persistent output buffer err = hipMemcpyAsync(output_batch, d_output_persistent, batch_size * 32, hipMemcpyDeviceToHost, 0); if (err != hipSuccess) { fprintf(stderr, "HIP error: Failed to copy batch output: %s\n", hipGetErrorString(err)); } // Synchronize for completion (no GPU memory cleanup - PERSISTENT REUSE!) hipDeviceSynchronize(); // Only free HOST memory (GPU memory stays allocated for maximum performance) free(host_batch); } // Cleanup function to free GPU memory when miner shuts down extern "C" void rinhash_hip_cleanup() { // Clean up old cache system if (gpu_memory_initialized) { hipFree(d_input_cache); hipFree(d_output_cache); hipFree(d_memory_cache); d_input_cache = nullptr; d_output_cache = nullptr; d_memory_cache = nullptr; gpu_memory_initialized = false; cached_input_size = 0; } // Clean up new persistent system if (persistent_memory_initialized) { printf("RinHashGPU: Cleaning up persistent memory on shutdown\n"); hipFree(d_input_persistent); hipFree(d_output_persistent); hipFree(d_memory_persistent); d_input_persistent = nullptr; d_output_persistent = nullptr; d_memory_persistent = nullptr; persistent_memory_initialized = false; persistent_max_batch = 0; } } // Helper function to convert a block header to bytes extern "C" void blockheader_to_bytes( const uint32_t* version, const uint32_t* prev_block, const uint32_t* merkle_root, const uint32_t* timestamp, const uint32_t* bits, const uint32_t* nonce, uint8_t* output, size_t* output_len ) { size_t offset = 0; memcpy(output + offset, version, 4); offset += 4; memcpy(output + offset, prev_block, 32); offset += 32; memcpy(output + offset, merkle_root, 32); offset += 32; memcpy(output + offset, timestamp, 4); offset += 4; memcpy(output + offset, bits, 4); offset += 4; memcpy(output + offset, nonce, 4); offset += 4; *output_len = offset; }