diff --git a/rin/miner/cpuminer/cpuminer-opt-rin b/rin/miner/cpuminer/cpuminer-opt-rin new file mode 160000 index 0000000..dfbd6b0 --- /dev/null +++ b/rin/miner/cpuminer/cpuminer-opt-rin @@ -0,0 +1 @@ +Subproject commit dfbd6b03a6c731346e405e6886063b5915f09c85 diff --git a/rin/miner/gpu/RinHash-hip/rinhash.hip.cu b/rin/miner/gpu/RinHash-hip/rinhash.hip.cu index cded0ca..fb2afd5 100644 --- a/rin/miner/gpu/RinHash-hip/rinhash.hip.cu +++ b/rin/miner/gpu/RinHash-hip/rinhash.hip.cu @@ -1,5 +1,4 @@ -#include -#include +#include "hip_runtime_shim.h" #include #include #include @@ -9,100 +8,109 @@ // Include shared device functions #include "rinhash_device.cuh" #include "argon2d_device.cuh" -#include "sha3-256.cu" +#include "sha3-256.hip.cu" #include "blake3_device.cuh" - -// External references to our CUDA implementations -extern "C" void blake3_hash(const uint8_t* input, size_t input_len, uint8_t* output); -extern "C" void argon2d_hash_rinhash(uint8_t* output, const uint8_t* input, size_t input_len); -extern "C" void sha3_256_hash(const uint8_t* input, size_t input_len, uint8_t* output); - -// Modified kernel to use device functions +// Modified kernel to use device functions and write output extern "C" __global__ void rinhash_cuda_kernel( - const uint8_t* input, - size_t input_len, - uint8_t* output + const uint8_t* input, + size_t input_len, + uint8_t* output, + block* argon2_memory ) { - // Intermediate results in shared memory __shared__ uint8_t blake3_out[32]; __shared__ uint8_t argon2_out[32]; - // Only one thread should do this work + if (threadIdx.x == 0) { - // Step 1: BLAKE3 hash - now using light_hash_device + // Step 1: BLAKE3 hash light_hash_device(input, input_len, blake3_out); - // Step 2: Argon2d hash - uint32_t m_cost = 64; // Example - size_t memory_size = m_cost * sizeof(block); - block* d_memory = (block*)malloc(memory_size); + + // Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1) uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' }; - device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, d_memory, salt, 11); - + device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11); + // Step 3: SHA3-256 hash uint8_t sha3_out[32]; sha3_256_device(argon2_out, 32, sha3_out); - + + // Write result to output + for (int i = 0; i < 32; i++) { + output[i] = sha3_out[i]; + } } - - // Use syncthreads to ensure all threads wait for the computation to complete + __syncthreads(); } -// RinHash CUDA implementation +// RinHash HIP implementation for a single header extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) { - // Allocate device memory + // Argon2 parameters + const uint32_t m_cost = 64; // blocks (64 KiB) + uint8_t *d_input = nullptr; uint8_t *d_output = nullptr; + block *d_memory = nullptr; cudaError_t err; - // Allocate memory on device + // Allocate device buffers err = cudaMalloc(&d_input, input_len); if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "HIP error: Failed to allocate input memory: %s\n", cudaGetErrorString(err)); return; } err = cudaMalloc(&d_output, 32); if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to allocate output memory: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "HIP error: Failed to allocate output memory: %s\n", cudaGetErrorString(err)); cudaFree(d_input); return; } - // Copy input data to device + // Allocate Argon2 memory once per hash + err = cudaMalloc(&d_memory, m_cost * sizeof(block)); + if (err != cudaSuccess) { + fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err)); + cudaFree(d_input); + cudaFree(d_output); + return; + } + + // Copy input header err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice); if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", cudaGetErrorString(err)); + cudaFree(d_memory); cudaFree(d_input); cudaFree(d_output); return; } - // Launch the kernel - rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output); + // Launch the kernel (single thread is fine for single hash) + rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory); - // Wait for kernel to finish + // Wait err = cudaDeviceSynchronize(); if (err != cudaSuccess) { - fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "HIP error during kernel execution: %s\n", cudaGetErrorString(err)); + cudaFree(d_memory); cudaFree(d_input); cudaFree(d_output); return; } - // Copy result back to host + // Copy result err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost); if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", cudaGetErrorString(err)); } - // Free device memory + // Free + cudaFree(d_memory); cudaFree(d_input); cudaFree(d_output); } - // Helper function to convert a block header to bytes extern "C" void blockheader_to_bytes( const uint32_t* version, @@ -115,137 +123,83 @@ extern "C" void blockheader_to_bytes( size_t* output_len ) { size_t offset = 0; - - // Version (4 bytes) - memcpy(output + offset, version, 4); - offset += 4; - - // Previous block hash (32 bytes) - memcpy(output + offset, prev_block, 32); - offset += 32; - - // Merkle root (32 bytes) - memcpy(output + offset, merkle_root, 32); - offset += 32; - - // Timestamp (4 bytes) - memcpy(output + offset, timestamp, 4); - offset += 4; - - // Bits (4 bytes) - memcpy(output + offset, bits, 4); - offset += 4; - - // Nonce (4 bytes) - memcpy(output + offset, nonce, 4); - offset += 4; - + + memcpy(output + offset, version, 4); offset += 4; + memcpy(output + offset, prev_block, 32); offset += 32; + memcpy(output + offset, merkle_root, 32); offset += 32; + memcpy(output + offset, timestamp, 4); offset += 4; + memcpy(output + offset, bits, 4); offset += 4; + memcpy(output + offset, nonce, 4); offset += 4; + *output_len = offset; } -// Batch processing version for mining +// Batch processing version for mining (sequential per header for correctness) extern "C" void rinhash_cuda_batch( const uint8_t* block_headers, size_t block_header_len, uint8_t* outputs, uint32_t num_blocks ) { - // Reset device to clear any previous errors - cudaError_t err = cudaDeviceReset(); + // Argon2 parameters + const uint32_t m_cost = 64; + + // Allocate reusable device buffers + uint8_t *d_input = nullptr; + uint8_t *d_output = nullptr; + block *d_memory = nullptr; + + cudaError_t err; + + err = cudaMalloc(&d_input, block_header_len); if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to reset device: %s\n", - cudaGetErrorString(err)); + fprintf(stderr, "HIP error: Failed to allocate header buffer: %s\n", cudaGetErrorString(err)); return; } - - - // Check available memory - size_t free_mem, total_mem; - err = cudaMemGetInfo(&free_mem, &total_mem); + + err = cudaMalloc(&d_output, 32); if (err != cudaSuccess) { - //fprintf(stderr, "CUDA error: Failed to get memory info: %s\n", - // cudaGetErrorString(err)); + fprintf(stderr, "HIP error: Failed to allocate output buffer: %s\n", cudaGetErrorString(err)); + cudaFree(d_input); return; } - - size_t headers_size = num_blocks * block_header_len; - size_t outputs_size = num_blocks * 32; - size_t required_mem = headers_size + outputs_size; - - if (required_mem > free_mem) { - fprintf(stderr, "CUDA error: Not enough memory (required: %zu, free: %zu)\n", - required_mem, free_mem); - return; - } - - // Allocate device memory - uint8_t *d_headers = NULL; - uint8_t *d_outputs = NULL; - - // Allocate memory for input block headers with error check - err = cudaMalloc((void**)&d_headers, headers_size); + + err = cudaMalloc(&d_memory, m_cost * sizeof(block)); if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to allocate device memory for headers (%zu bytes): %s\n", - headers_size, cudaGetErrorString(err)); + fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err)); + cudaFree(d_input); + cudaFree(d_output); return; } - - // Allocate memory for output hashes with error check - err = cudaMalloc((void**)&d_outputs, outputs_size); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to allocate device memory for outputs (%zu bytes): %s\n", - outputs_size, cudaGetErrorString(err)); - cudaFree(d_headers); - return; - } - - // Copy block headers from host to device - err = cudaMemcpy(d_headers, block_headers, headers_size, cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to copy headers to device: %s\n", - cudaGetErrorString(err)); - cudaFree(d_headers); - cudaFree(d_outputs); - return; - } - - // Process one header at a time to isolate any issues + for (uint32_t i = 0; i < num_blocks; i++) { - const uint8_t* input = d_headers + i * block_header_len; - uint8_t* output = d_outputs + i * 32; - - // Call rinhash_cuda_kernel with device pointers and proper launch configuration - rinhash_cuda_kernel<<<1, 32>>>(input, block_header_len, output); - - // Check for errors after each processing - err = cudaGetLastError(); + const uint8_t* header = block_headers + i * block_header_len; + uint8_t* out = outputs + i * 32; + + err = cudaMemcpy(d_input, header, block_header_len, cudaMemcpyHostToDevice); if (err != cudaSuccess) { - fprintf(stderr, "CUDA error in block %u: %s\n", i, cudaGetErrorString(err)); - cudaFree(d_headers); - cudaFree(d_outputs); - return; + fprintf(stderr, "HIP error: copy header %u failed: %s\n", i, cudaGetErrorString(err)); + break; + } + + rinhash_cuda_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory); + + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + fprintf(stderr, "HIP error in kernel %u: %s\n", i, cudaGetErrorString(err)); + break; + } + + err = cudaMemcpy(out, d_output, 32, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + fprintf(stderr, "HIP error: copy out %u failed: %s\n", i, cudaGetErrorString(err)); + break; } } - - // Synchronize device to ensure all operations are complete - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error during synchronization: %s\n", cudaGetErrorString(err)); - cudaFree(d_headers); - cudaFree(d_outputs); - return; - } - - // Copy results back from device to host - err = cudaMemcpy(outputs, d_outputs, outputs_size, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to copy results from device: %s\n", - cudaGetErrorString(err)); - } - - // Free device memory - cudaFree(d_headers); - cudaFree(d_outputs); + + cudaFree(d_memory); + cudaFree(d_output); + cudaFree(d_input); } // Main RinHash function that would be called from outside @@ -258,10 +212,9 @@ extern "C" void RinHash( const uint32_t* nonce, uint8_t* output ) { - uint8_t block_header[80]; // Standard block header size + uint8_t block_header[80]; size_t block_header_len; - - // Convert block header to bytes + blockheader_to_bytes( version, prev_block, @@ -272,12 +225,11 @@ extern "C" void RinHash( block_header, &block_header_len ); - - // Calculate RinHash + rinhash_cuda(block_header, block_header_len, output); } -// Mining function that tries different nonces +// Mining function that tries different nonces (host-side best selection) extern "C" void RinHash_mine( const uint32_t* version, const uint32_t* prev_block, @@ -293,15 +245,12 @@ extern "C" void RinHash_mine( const size_t block_header_len = 80; std::vector block_headers(block_header_len * num_nonces); std::vector hashes(32 * num_nonces); - - // Prepare block headers with different nonces + for (uint32_t i = 0; i < num_nonces; i++) { uint32_t current_nonce = start_nonce + i; - - // Fill in the common parts of the header uint8_t* header = block_headers.data() + i * block_header_len; size_t header_len; - + blockheader_to_bytes( version, prev_block, @@ -313,29 +262,19 @@ extern "C" void RinHash_mine( &header_len ); } - - // Calculate hashes for all nonces + rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces); - - // Find the best hash (lowest value) + memcpy(best_hash, hashes.data(), 32); *found_nonce = start_nonce; - + for (uint32_t i = 1; i < num_nonces; i++) { uint8_t* current_hash = hashes.data() + i * 32; - - // Compare current hash with best hash (byte by byte, from most significant to least) bool is_better = false; for (int j = 0; j < 32; j++) { - if (current_hash[j] < best_hash[j]) { - is_better = true; - break; - } - else if (current_hash[j] > best_hash[j]) { - break; - } + if (current_hash[j] < best_hash[j]) { is_better = true; break; } + else if (current_hash[j] > best_hash[j]) { break; } } - if (is_better) { memcpy(best_hash, current_hash, 32); *found_nonce = start_nonce + i; diff --git a/rin/miner/readme.md b/rin/miner/readme.md index 53fb967..4d64c4c 100644 --- a/rin/miner/readme.md +++ b/rin/miner/readme.md @@ -20,4 +20,5 @@ cd cpuminer-opt-rinhash make -j$(nproc) # Test the newly built binary -./cpuminer -a rinhash -o stratum+tcp://192.168.0.188:3333 -u username.workername -p x -t 4 \ No newline at end of file +./cpuminer -a rinhash -o stratum+tcp://192.168.0.188:3333 -u db.win -p x -t 4 +cpuminer-rinhash.exe -a rinhash -o stratum+tcp://192.168.0.188:3334 -u db.win -p x -t 4 \ No newline at end of file