diff --git a/rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh b/rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh index 844bd57..90b7fcf 100644 --- a/rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh +++ b/rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh @@ -5,8 +5,9 @@ using namespace std; // Let's use a pinned memory vector! -#include -#include +// Removed Thrust pinned allocator dependency for portability +// #include +// #include using u32 = uint32_t; using u64 = uint64_t; @@ -228,10 +229,8 @@ void Chunk::compress_chunk(u32 out_flags) { } } -using thrust_vector = thrust::host_vector< - Chunk, - thrust::system::cuda::experimental::pinned_allocator ->; +// Fallback alias: use std::vector instead of thrust pinned host vector +using thrust_vector = std::vector; // The GPU hasher void light_hash(Chunk*, int, Chunk*, Chunk*); diff --git a/rin/miner/gpu/RinHash-cuda/build-cuda.bat b/rin/miner/gpu/RinHash-cuda/build-cuda.bat index edd40f3..15ff84b 100644 --- a/rin/miner/gpu/RinHash-cuda/build-cuda.bat +++ b/rin/miner/gpu/RinHash-cuda/build-cuda.bat @@ -52,19 +52,12 @@ echo. echo Building RinHash CUDA miner... echo. -REM Compile with NVCC -nvcc -O3 -arch=sm_50 ^ +REM Compile with NVCC (enable device linking for dynamic parallelism) +nvcc -O3 -rdc=true -arch=sm_50 ^ -gencode arch=compute_50,code=sm_50 ^ - -gencode arch=compute_52,code=sm_52 ^ - -gencode arch=compute_60,code=sm_60 ^ - -gencode arch=compute_61,code=sm_61 ^ - -gencode arch=compute_70,code=sm_70 ^ - -gencode arch=compute_75,code=sm_75 ^ - -gencode arch=compute_80,code=sm_80 ^ - -gencode arch=compute_86,code=sm_86 ^ -I. rinhash.cu sha3-256.cu ^ -o rinhash-cuda-miner.exe ^ - -lcuda -lcudart + -lcuda -lcudart -lcudadevrt if errorlevel 1 ( echo. diff --git a/rin/miner/gpu/RinHash-cuda/rinhash.cu b/rin/miner/gpu/RinHash-cuda/rinhash.cu index cded0ca..283759e 100644 --- a/rin/miner/gpu/RinHash-cuda/rinhash.cu +++ b/rin/miner/gpu/RinHash-cuda/rinhash.cu @@ -12,51 +12,36 @@ #include "sha3-256.cu" #include "blake3_device.cuh" - -// External references to our CUDA implementations -extern "C" void blake3_hash(const uint8_t* input, size_t input_len, uint8_t* output); -extern "C" void argon2d_hash_rinhash(uint8_t* output, const uint8_t* input, size_t input_len); -extern "C" void sha3_256_hash(const uint8_t* input, size_t input_len, uint8_t* output); - // Modified kernel to use device functions extern "C" __global__ void rinhash_cuda_kernel( - const uint8_t* input, - size_t input_len, - uint8_t* output + const uint8_t* input, + size_t input_len, + uint8_t* output, + block* argon2_memory ) { - // Intermediate results in shared memory __shared__ uint8_t blake3_out[32]; __shared__ uint8_t argon2_out[32]; - // Only one thread should do this work if (threadIdx.x == 0) { - // Step 1: BLAKE3 hash - now using light_hash_device light_hash_device(input, input_len, blake3_out); - // Step 2: Argon2d hash - uint32_t m_cost = 64; // Example - size_t memory_size = m_cost * sizeof(block); - block* d_memory = (block*)malloc(memory_size); uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' }; - device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, d_memory, salt, 11); - - // Step 3: SHA3-256 hash + device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11); uint8_t sha3_out[32]; sha3_256_device(argon2_out, 32, sha3_out); - + for (int i = 0; i < 32; i++) output[i] = sha3_out[i]; } - - // Use syncthreads to ensure all threads wait for the computation to complete __syncthreads(); } // RinHash CUDA implementation extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) { - // Allocate device memory + const uint32_t m_cost = 64; // Argon2 blocks (64 KiB) + uint8_t *d_input = nullptr; uint8_t *d_output = nullptr; + block *d_memory = nullptr; cudaError_t err; - // Allocate memory on device err = cudaMalloc(&d_input, input_len); if (err != cudaSuccess) { fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err)); @@ -70,39 +55,44 @@ extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* ou return; } - // Copy input data to device + err = cudaMalloc(&d_memory, m_cost * sizeof(block)); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err)); + cudaFree(d_input); + cudaFree(d_output); + return; + } + err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err)); + cudaFree(d_memory); cudaFree(d_input); cudaFree(d_output); return; } - // Launch the kernel - rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output); + rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory); - // Wait for kernel to finish err = cudaDeviceSynchronize(); if (err != cudaSuccess) { fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err)); + cudaFree(d_memory); cudaFree(d_input); cudaFree(d_output); return; } - // Copy result back to host err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost); if (err != cudaSuccess) { fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err)); } - // Free device memory + cudaFree(d_memory); cudaFree(d_input); cudaFree(d_output); } - // Helper function to convert a block header to bytes extern "C" void blockheader_to_bytes( const uint32_t* version, @@ -115,137 +105,55 @@ extern "C" void blockheader_to_bytes( size_t* output_len ) { size_t offset = 0; - - // Version (4 bytes) - memcpy(output + offset, version, 4); - offset += 4; - - // Previous block hash (32 bytes) - memcpy(output + offset, prev_block, 32); - offset += 32; - - // Merkle root (32 bytes) - memcpy(output + offset, merkle_root, 32); - offset += 32; - - // Timestamp (4 bytes) - memcpy(output + offset, timestamp, 4); - offset += 4; - - // Bits (4 bytes) - memcpy(output + offset, bits, 4); - offset += 4; - - // Nonce (4 bytes) - memcpy(output + offset, nonce, 4); - offset += 4; - + memcpy(output + offset, version, 4); offset += 4; + memcpy(output + offset, prev_block, 32); offset += 32; + memcpy(output + offset, merkle_root, 32); offset += 32; + memcpy(output + offset, timestamp, 4); offset += 4; + memcpy(output + offset, bits, 4); offset += 4; + memcpy(output + offset, nonce, 4); offset += 4; *output_len = offset; } -// Batch processing version for mining +// Batch processing version for mining (sequential per header for now) extern "C" void rinhash_cuda_batch( const uint8_t* block_headers, size_t block_header_len, uint8_t* outputs, uint32_t num_blocks ) { - // Reset device to clear any previous errors - cudaError_t err = cudaDeviceReset(); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to reset device: %s\n", - cudaGetErrorString(err)); - return; - } - - - // Check available memory - size_t free_mem, total_mem; - err = cudaMemGetInfo(&free_mem, &total_mem); - if (err != cudaSuccess) { - //fprintf(stderr, "CUDA error: Failed to get memory info: %s\n", - // cudaGetErrorString(err)); - return; - } - - size_t headers_size = num_blocks * block_header_len; - size_t outputs_size = num_blocks * 32; - size_t required_mem = headers_size + outputs_size; - - if (required_mem > free_mem) { - fprintf(stderr, "CUDA error: Not enough memory (required: %zu, free: %zu)\n", - required_mem, free_mem); - return; - } - - // Allocate device memory - uint8_t *d_headers = NULL; - uint8_t *d_outputs = NULL; - - // Allocate memory for input block headers with error check - err = cudaMalloc((void**)&d_headers, headers_size); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to allocate device memory for headers (%zu bytes): %s\n", - headers_size, cudaGetErrorString(err)); - return; - } - - // Allocate memory for output hashes with error check - err = cudaMalloc((void**)&d_outputs, outputs_size); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to allocate device memory for outputs (%zu bytes): %s\n", - outputs_size, cudaGetErrorString(err)); - cudaFree(d_headers); - return; - } - - // Copy block headers from host to device - err = cudaMemcpy(d_headers, block_headers, headers_size, cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to copy headers to device: %s\n", - cudaGetErrorString(err)); - cudaFree(d_headers); - cudaFree(d_outputs); - return; - } - - // Process one header at a time to isolate any issues + const uint32_t m_cost = 64; + + uint8_t *d_input = NULL; + uint8_t *d_output = NULL; + block *d_memory = NULL; + + cudaError_t err; + + err = cudaMalloc((void**)&d_input, block_header_len); + if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc header: %s\n", cudaGetErrorString(err)); return; } + err = cudaMalloc((void**)&d_output, 32); + if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc output: %s\n", cudaGetErrorString(err)); cudaFree(d_input); return; } + err = cudaMalloc((void**)&d_memory, m_cost * sizeof(block)); + if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc argon2 mem: %s\n", cudaGetErrorString(err)); cudaFree(d_input); cudaFree(d_output); return; } + for (uint32_t i = 0; i < num_blocks; i++) { - const uint8_t* input = d_headers + i * block_header_len; - uint8_t* output = d_outputs + i * 32; - - // Call rinhash_cuda_kernel with device pointers and proper launch configuration - rinhash_cuda_kernel<<<1, 32>>>(input, block_header_len, output); - - // Check for errors after each processing - err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error in block %u: %s\n", i, cudaGetErrorString(err)); - cudaFree(d_headers); - cudaFree(d_outputs); - return; - } + const uint8_t* input = block_headers + i * block_header_len; + uint8_t* output = outputs + i * 32; + + err = cudaMemcpy(d_input, input, block_header_len, cudaMemcpyHostToDevice); + if (err != cudaSuccess) { fprintf(stderr, "CUDA error: copy header %u: %s\n", i, cudaGetErrorString(err)); break; } + + rinhash_cuda_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory); + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { fprintf(stderr, "CUDA error in kernel %u: %s\n", i, cudaGetErrorString(err)); break; } + + err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { fprintf(stderr, "CUDA error: copy out %u: %s\n", i, cudaGetErrorString(err)); break; } } - - // Synchronize device to ensure all operations are complete - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error during synchronization: %s\n", cudaGetErrorString(err)); - cudaFree(d_headers); - cudaFree(d_outputs); - return; - } - - // Copy results back from device to host - err = cudaMemcpy(outputs, d_outputs, outputs_size, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error: Failed to copy results from device: %s\n", - cudaGetErrorString(err)); - } - - // Free device memory - cudaFree(d_headers); - cudaFree(d_outputs); + + cudaFree(d_memory); + cudaFree(d_output); + cudaFree(d_input); } // Main RinHash function that would be called from outside @@ -260,8 +168,7 @@ extern "C" void RinHash( ) { uint8_t block_header[80]; // Standard block header size size_t block_header_len; - - // Convert block header to bytes + blockheader_to_bytes( version, prev_block, @@ -272,8 +179,7 @@ extern "C" void RinHash( block_header, &block_header_len ); - - // Calculate RinHash + rinhash_cuda(block_header, block_header_len, output); } @@ -293,15 +199,11 @@ extern "C" void RinHash_mine( const size_t block_header_len = 80; std::vector block_headers(block_header_len * num_nonces); std::vector hashes(32 * num_nonces); - - // Prepare block headers with different nonces + for (uint32_t i = 0; i < num_nonces; i++) { uint32_t current_nonce = start_nonce + i; - - // Fill in the common parts of the header uint8_t* header = block_headers.data() + i * block_header_len; size_t header_len; - blockheader_to_bytes( version, prev_block, @@ -313,32 +215,18 @@ extern "C" void RinHash_mine( &header_len ); } - - // Calculate hashes for all nonces + rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces); - - // Find the best hash (lowest value) + memcpy(best_hash, hashes.data(), 32); *found_nonce = start_nonce; - for (uint32_t i = 1; i < num_nonces; i++) { uint8_t* current_hash = hashes.data() + i * 32; - - // Compare current hash with best hash (byte by byte, from most significant to least) bool is_better = false; for (int j = 0; j < 32; j++) { - if (current_hash[j] < best_hash[j]) { - is_better = true; - break; - } - else if (current_hash[j] > best_hash[j]) { - break; - } - } - - if (is_better) { - memcpy(best_hash, current_hash, 32); - *found_nonce = start_nonce + i; + if (current_hash[j] < best_hash[j]) { is_better = true; break; } + else if (current_hash[j] > best_hash[j]) { break; } } + if (is_better) { memcpy(best_hash, current_hash, 32); *found_nonce = start_nonce + i; } } }