// NOTE(review): the system header names below were not present in the reviewed
// text (only the bare `#include` tokens survived). They are reconstructed from
// what this file uses — confirm against the original file.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>

// Include shared device functions
#include "rinhash_device.cuh"
#include "argon2d_device.cuh"
#include "sha3-256.cu"
#include "blake3_device.cuh"

// Size in bytes of one hash produced by the kernel (host-side constant).
static const size_t kRinHashBytes = 32;
// Size in bytes of a serialized block header: 4 + 32 + 32 + 4 + 4 + 4.
static const size_t kBlockHeaderBytes = 80;

// Computes one hash on the device.
//
// Pipeline, executed by thread 0 only:
//   light_hash_device(input) -> 32 bytes
//   device_argon2d_hash(...) -> 32 bytes, using `argon2_memory` as scratch
//   sha3_256_device(...)     -> 32 bytes written to `output`
//
// Launch layout: the host code in this file always launches <<<1, 1>>>. Only
// threadIdx.x == 0 does any work, and every block would write the same
// `output`, so larger launches add no parallelism.
//
// Parameters:
//   input         - device pointer to the message bytes
//   input_len     - number of bytes readable at `input`
//   output        - device pointer receiving 32 bytes
//   argon2_memory - device scratch buffer; the host side allocates 64 `block`
//                   elements for it
extern "C" __global__ void rinhash_cuda_kernel(
    const uint8_t* input,
    size_t input_len,
    uint8_t* output,
    block* argon2_memory
) {
    __shared__ uint8_t blake3_out[32];
    __shared__ uint8_t argon2_out[32];

    if (threadIdx.x == 0) {
        light_hash_device(input, input_len, blake3_out);

        uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
        // NOTE(review): the numeric arguments are presumably
        // (input length = 32, t_cost = 2, m_cost = 64, lanes = 1) — confirm
        // against device_argon2d_hash's declaration. The 64 must stay in sync
        // with the m_cost used by the host allocations below.
        device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11);

        uint8_t sha3_out[32];
        sha3_256_device(argon2_out, 32, sha3_out);

        for (int i = 0; i < 32; i++) {
            output[i] = sha3_out[i];
        }
    }
    // Kept outside the branch so every thread of the block reaches the barrier.
    __syncthreads();
}

// Releases the three device buffers used by the host wrappers.
// cudaFree on a null pointer performs no operation, so this is safe to call
// with buffers that were never allocated.
static void rinhash_host_free_buffers(uint8_t* d_input, uint8_t* d_output, block* d_memory) {
    cudaFree(d_memory);
    cudaFree(d_input);
    cudaFree(d_output);
}

// Launches the single-thread kernel and waits for it to finish.
// Returns the launch error (bad configuration etc.) if there is one, otherwise
// the result of the synchronization, which carries in-kernel faults.
static cudaError_t rinhash_host_run_kernel(
    const uint8_t* d_input,
    size_t input_len,
    uint8_t* d_output,
    block* d_memory
) {
    rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);

    // A kernel launch returns nothing; launch errors must be fetched explicitly.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        return err;
    }
    return cudaDeviceSynchronize();
}

// RinHash CUDA implementation
//
// Hashes `input_len` bytes at host pointer `input` and writes 32 bytes to host
// pointer `output`. On any CUDA failure a message is printed to stderr and the
// function returns; because it returns void, callers cannot detect failure. If
// the failure occurs before the final copy is attempted, `output` is not
// written.
extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
    const uint32_t m_cost = 64; // Argon2 blocks (64 KiB)

    uint8_t *d_input = nullptr;
    uint8_t *d_output = nullptr;
    block *d_memory = nullptr;

    cudaError_t err;

    err = cudaMalloc(&d_input, input_len);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
        return;
    }

    err = cudaMalloc(&d_output, kRinHashBytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to allocate output memory: %s\n", cudaGetErrorString(err));
        rinhash_host_free_buffers(d_input, d_output, d_memory);
        return;
    }

    err = cudaMalloc(&d_memory, m_cost * sizeof(block));
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
        rinhash_host_free_buffers(d_input, d_output, d_memory);
        return;
    }

    err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
        rinhash_host_free_buffers(d_input, d_output, d_memory);
        return;
    }

    err = rinhash_host_run_kernel(d_input, input_len, d_output, d_memory);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err));
        rinhash_host_free_buffers(d_input, d_output, d_memory);
        return;
    }

    err = cudaMemcpy(output, d_output, kRinHashBytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
    }

    rinhash_host_free_buffers(d_input, d_output, d_memory);
}

// Helper function to convert a block header to bytes
//
// Concatenates the fields with memcpy (host byte order, no conversion) in the
// order version(4) | prev_block(32) | merkle_root(32) | timestamp(4) | bits(4)
// | nonce(4). `output` must have room for 80 bytes; `*output_len` is set to
// the number of bytes written (always 80).
extern "C" void blockheader_to_bytes(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    const uint32_t* nonce,
    uint8_t* output,
    size_t* output_len
) {
    size_t offset = 0;

    memcpy(output + offset, version, 4);      offset += 4;
    memcpy(output + offset, prev_block, 32);  offset += 32;
    memcpy(output + offset, merkle_root, 32); offset += 32;
    memcpy(output + offset, timestamp, 4);    offset += 4;
    memcpy(output + offset, bits, 4);         offset += 4;
    memcpy(output + offset, nonce, 4);        offset += 4;

    *output_len = offset;
}

// Batch processing version for mining (sequential per header for now)
//
// Hashes `num_blocks` headers stored back to back in `block_headers` (each
// `block_header_len` bytes) and writes 32 bytes per header into `outputs`.
// Device buffers are allocated once and reused for every header. Processing
// stops at the first CUDA error; entries of `outputs` from the failing index
// onward may be left unwritten, and the void return gives callers no way to
// detect this.
extern "C" void rinhash_cuda_batch(
    const uint8_t* block_headers,
    size_t block_header_len,
    uint8_t* outputs,
    uint32_t num_blocks
) {
    const uint32_t m_cost = 64;

    uint8_t *d_input = NULL;
    uint8_t *d_output = NULL;
    block *d_memory = NULL;

    cudaError_t err;

    err = cudaMalloc((void**)&d_input, block_header_len);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: alloc header: %s\n", cudaGetErrorString(err));
        return;
    }

    err = cudaMalloc((void**)&d_output, kRinHashBytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: alloc output: %s\n", cudaGetErrorString(err));
        rinhash_host_free_buffers(d_input, d_output, d_memory);
        return;
    }

    err = cudaMalloc((void**)&d_memory, m_cost * sizeof(block));
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: alloc argon2 mem: %s\n", cudaGetErrorString(err));
        rinhash_host_free_buffers(d_input, d_output, d_memory);
        return;
    }

    for (uint32_t i = 0; i < num_blocks; i++) {
        // Offsets are computed in size_t so that large batches cannot wrap
        // 32-bit arithmetic.
        const uint8_t* input = block_headers + (size_t)i * block_header_len;
        uint8_t* output = outputs + (size_t)i * kRinHashBytes;

        err = cudaMemcpy(d_input, input, block_header_len, cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error: copy header %u: %s\n", i, cudaGetErrorString(err));
            break;
        }

        err = rinhash_host_run_kernel(d_input, block_header_len, d_output, d_memory);
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error in kernel %u: %s\n", i, cudaGetErrorString(err));
            break;
        }

        err = cudaMemcpy(output, d_output, kRinHashBytes, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error: copy out %u: %s\n", i, cudaGetErrorString(err));
            break;
        }
    }

    rinhash_host_free_buffers(d_input, d_output, d_memory);
}

// Main RinHash function that would be called from outside
//
// Serializes the header fields with blockheader_to_bytes and hashes the
// resulting 80 bytes with rinhash_cuda. `output` receives 32 bytes on success.
extern "C" void RinHash(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    const uint32_t* nonce,
    uint8_t* output
) {
    uint8_t block_header[kBlockHeaderBytes]; // Standard block header size
    size_t block_header_len;

    blockheader_to_bytes(
        version,
        prev_block,
        merkle_root,
        timestamp,
        bits,
        nonce,
        block_header,
        &block_header_len
    );

    rinhash_cuda(block_header, block_header_len, output);
}

// Mining function that tries different nonces
//
// Builds `num_nonces` headers with nonces start_nonce, start_nonce + 1, ...
// (32-bit wrap-around), hashes them with rinhash_cuda_batch, and reports the
// hash that is smallest when compared byte by byte from index 0. Ties keep the
// earliest nonce.
//
// Outputs:
//   *found_nonce - nonce that produced the smallest hash
//   best_hash    - that hash (32 bytes)
//
// `target_hash` is accepted for interface compatibility but is not consulted.
// When `num_nonces` is 0 nothing is hashed and the outputs are left untouched.
//
// NOTE(review): hashes are zero-initialized before the batch call, so if the
// batch aborts on a CUDA error the unwritten all-zero entries will compare as
// "best". rinhash_cuda_batch returns void, so this cannot be detected here.
extern "C" void RinHash_mine(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    uint32_t start_nonce,
    uint32_t num_nonces,
    uint32_t* found_nonce,
    uint8_t* target_hash,
    uint8_t* best_hash
) {
    (void)target_hash;

    // Copying the "first" hash out of an empty buffer would be undefined.
    if (num_nonces == 0) {
        return;
    }

    const size_t block_header_len = kBlockHeaderBytes;
    const size_t count = num_nonces;

    // Sizes are computed in size_t so that large nonce ranges cannot wrap.
    std::vector<uint8_t> block_headers(block_header_len * count);
    std::vector<uint8_t> hashes(kRinHashBytes * count);

    for (uint32_t i = 0; i < num_nonces; i++) {
        uint32_t current_nonce = start_nonce + i;
        uint8_t* header = block_headers.data() + (size_t)i * block_header_len;
        size_t header_len; // required by the helper; always 80

        blockheader_to_bytes(
            version,
            prev_block,
            merkle_root,
            timestamp,
            bits,
            &current_nonce,
            header,
            &header_len
        );
    }

    rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);

    memcpy(best_hash, hashes.data(), kRinHashBytes);
    *found_nonce = start_nonce;

    for (uint32_t i = 1; i < num_nonces; i++) {
        const uint8_t* current_hash = hashes.data() + (size_t)i * kRinHashBytes;

        // memcmp compares as unsigned bytes from index 0, i.e. the same
        // lexicographic order as an explicit byte-by-byte loop.
        if (memcmp(current_hash, best_hash, kRinHashBytes) < 0) {
            memcpy(best_hash, current_hash, kRinHashBytes);
            *found_nonce = start_nonce + i;
        }
    }
}