more gpu
This commit is contained in:
@@ -5,8 +5,9 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
// Let's use a pinned memory vector!
|
// Let's use a pinned memory vector!
|
||||||
#include <thrust/host_vector.h>
|
// Removed Thrust pinned allocator dependency for portability
|
||||||
#include <thrust/system/cuda/experimental/pinned_allocator.h>
|
// #include <thrust/host_vector.h>
|
||||||
|
// #include <thrust/system/cuda/experimental/pinned_allocator.h>
|
||||||
|
|
||||||
using u32 = uint32_t;
|
using u32 = uint32_t;
|
||||||
using u64 = uint64_t;
|
using u64 = uint64_t;
|
||||||
@@ -228,10 +229,8 @@ void Chunk::compress_chunk(u32 out_flags) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
using thrust_vector = thrust::host_vector<
|
// Fallback alias: use std::vector instead of thrust pinned host vector
|
||||||
Chunk,
|
using thrust_vector = std::vector<Chunk>;
|
||||||
thrust::system::cuda::experimental::pinned_allocator<Chunk>
|
|
||||||
>;
|
|
||||||
|
|
||||||
// The GPU hasher
|
// The GPU hasher
|
||||||
void light_hash(Chunk*, int, Chunk*, Chunk*);
|
void light_hash(Chunk*, int, Chunk*, Chunk*);
|
||||||
|
|||||||
@@ -52,19 +52,12 @@ echo.
|
|||||||
echo Building RinHash CUDA miner...
|
echo Building RinHash CUDA miner...
|
||||||
echo.
|
echo.
|
||||||
|
|
||||||
REM Compile with NVCC
|
REM Compile with NVCC (enable device linking for dynamic parallelism)
|
||||||
nvcc -O3 -arch=sm_50 ^
|
nvcc -O3 -rdc=true -arch=sm_50 ^
|
||||||
-gencode arch=compute_50,code=sm_50 ^
|
-gencode arch=compute_50,code=sm_50 ^
|
||||||
-gencode arch=compute_52,code=sm_52 ^
|
|
||||||
-gencode arch=compute_60,code=sm_60 ^
|
|
||||||
-gencode arch=compute_61,code=sm_61 ^
|
|
||||||
-gencode arch=compute_70,code=sm_70 ^
|
|
||||||
-gencode arch=compute_75,code=sm_75 ^
|
|
||||||
-gencode arch=compute_80,code=sm_80 ^
|
|
||||||
-gencode arch=compute_86,code=sm_86 ^
|
|
||||||
-I. rinhash.cu sha3-256.cu ^
|
-I. rinhash.cu sha3-256.cu ^
|
||||||
-o rinhash-cuda-miner.exe ^
|
-o rinhash-cuda-miner.exe ^
|
||||||
-lcuda -lcudart
|
-lcuda -lcudart -lcudadevrt
|
||||||
|
|
||||||
if errorlevel 1 (
|
if errorlevel 1 (
|
||||||
echo.
|
echo.
|
||||||
|
|||||||
@@ -12,51 +12,36 @@
|
|||||||
#include "sha3-256.cu"
|
#include "sha3-256.cu"
|
||||||
#include "blake3_device.cuh"
|
#include "blake3_device.cuh"
|
||||||
|
|
||||||
|
|
||||||
// External references to our CUDA implementations
|
|
||||||
extern "C" void blake3_hash(const uint8_t* input, size_t input_len, uint8_t* output);
|
|
||||||
extern "C" void argon2d_hash_rinhash(uint8_t* output, const uint8_t* input, size_t input_len);
|
|
||||||
extern "C" void sha3_256_hash(const uint8_t* input, size_t input_len, uint8_t* output);
|
|
||||||
|
|
||||||
// Modified kernel to use device functions
|
// Modified kernel to use device functions
|
||||||
extern "C" __global__ void rinhash_cuda_kernel(
|
extern "C" __global__ void rinhash_cuda_kernel(
|
||||||
const uint8_t* input,
|
const uint8_t* input,
|
||||||
size_t input_len,
|
size_t input_len,
|
||||||
uint8_t* output
|
uint8_t* output,
|
||||||
|
block* argon2_memory
|
||||||
) {
|
) {
|
||||||
// Intermediate results in shared memory
|
|
||||||
__shared__ uint8_t blake3_out[32];
|
__shared__ uint8_t blake3_out[32];
|
||||||
__shared__ uint8_t argon2_out[32];
|
__shared__ uint8_t argon2_out[32];
|
||||||
// Only one thread should do this work
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
// Step 1: BLAKE3 hash - now using light_hash_device
|
|
||||||
light_hash_device(input, input_len, blake3_out);
|
light_hash_device(input, input_len, blake3_out);
|
||||||
// Step 2: Argon2d hash
|
|
||||||
uint32_t m_cost = 64; // Example
|
|
||||||
size_t memory_size = m_cost * sizeof(block);
|
|
||||||
block* d_memory = (block*)malloc(memory_size);
|
|
||||||
uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
|
uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
|
||||||
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, d_memory, salt, 11);
|
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11);
|
||||||
|
|
||||||
// Step 3: SHA3-256 hash
|
|
||||||
uint8_t sha3_out[32];
|
uint8_t sha3_out[32];
|
||||||
sha3_256_device(argon2_out, 32, sha3_out);
|
sha3_256_device(argon2_out, 32, sha3_out);
|
||||||
|
for (int i = 0; i < 32; i++) output[i] = sha3_out[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use syncthreads to ensure all threads wait for the computation to complete
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
|
|
||||||
// RinHash CUDA implementation
|
// RinHash CUDA implementation
|
||||||
extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
|
extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
|
||||||
// Allocate device memory
|
const uint32_t m_cost = 64; // Argon2 blocks (64 KiB)
|
||||||
|
|
||||||
uint8_t *d_input = nullptr;
|
uint8_t *d_input = nullptr;
|
||||||
uint8_t *d_output = nullptr;
|
uint8_t *d_output = nullptr;
|
||||||
|
block *d_memory = nullptr;
|
||||||
|
|
||||||
cudaError_t err;
|
cudaError_t err;
|
||||||
|
|
||||||
// Allocate memory on device
|
|
||||||
err = cudaMalloc(&d_input, input_len);
|
err = cudaMalloc(&d_input, input_len);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
|
fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
|
||||||
@@ -70,39 +55,44 @@ extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* ou
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy input data to device
|
err = cudaMalloc(&d_memory, m_cost * sizeof(block));
|
||||||
|
if (err != cudaSuccess) {
|
||||||
|
fprintf(stderr, "CUDA error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
|
||||||
|
cudaFree(d_input);
|
||||||
|
cudaFree(d_output);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
|
err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
|
fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
|
||||||
|
cudaFree(d_memory);
|
||||||
cudaFree(d_input);
|
cudaFree(d_input);
|
||||||
cudaFree(d_output);
|
cudaFree(d_output);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Launch the kernel
|
rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);
|
||||||
rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output);
|
|
||||||
|
|
||||||
// Wait for kernel to finish
|
|
||||||
err = cudaDeviceSynchronize();
|
err = cudaDeviceSynchronize();
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err));
|
fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err));
|
||||||
|
cudaFree(d_memory);
|
||||||
cudaFree(d_input);
|
cudaFree(d_input);
|
||||||
cudaFree(d_output);
|
cudaFree(d_output);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy result back to host
|
|
||||||
err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
|
err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
|
fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Free device memory
|
cudaFree(d_memory);
|
||||||
cudaFree(d_input);
|
cudaFree(d_input);
|
||||||
cudaFree(d_output);
|
cudaFree(d_output);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Helper function to convert a block header to bytes
|
// Helper function to convert a block header to bytes
|
||||||
extern "C" void blockheader_to_bytes(
|
extern "C" void blockheader_to_bytes(
|
||||||
const uint32_t* version,
|
const uint32_t* version,
|
||||||
@@ -115,137 +105,55 @@ extern "C" void blockheader_to_bytes(
|
|||||||
size_t* output_len
|
size_t* output_len
|
||||||
) {
|
) {
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
|
memcpy(output + offset, version, 4); offset += 4;
|
||||||
// Version (4 bytes)
|
memcpy(output + offset, prev_block, 32); offset += 32;
|
||||||
memcpy(output + offset, version, 4);
|
memcpy(output + offset, merkle_root, 32); offset += 32;
|
||||||
offset += 4;
|
memcpy(output + offset, timestamp, 4); offset += 4;
|
||||||
|
memcpy(output + offset, bits, 4); offset += 4;
|
||||||
// Previous block hash (32 bytes)
|
memcpy(output + offset, nonce, 4); offset += 4;
|
||||||
memcpy(output + offset, prev_block, 32);
|
|
||||||
offset += 32;
|
|
||||||
|
|
||||||
// Merkle root (32 bytes)
|
|
||||||
memcpy(output + offset, merkle_root, 32);
|
|
||||||
offset += 32;
|
|
||||||
|
|
||||||
// Timestamp (4 bytes)
|
|
||||||
memcpy(output + offset, timestamp, 4);
|
|
||||||
offset += 4;
|
|
||||||
|
|
||||||
// Bits (4 bytes)
|
|
||||||
memcpy(output + offset, bits, 4);
|
|
||||||
offset += 4;
|
|
||||||
|
|
||||||
// Nonce (4 bytes)
|
|
||||||
memcpy(output + offset, nonce, 4);
|
|
||||||
offset += 4;
|
|
||||||
|
|
||||||
*output_len = offset;
|
*output_len = offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Batch processing version for mining
|
// Batch processing version for mining (sequential per header for now)
|
||||||
extern "C" void rinhash_cuda_batch(
|
extern "C" void rinhash_cuda_batch(
|
||||||
const uint8_t* block_headers,
|
const uint8_t* block_headers,
|
||||||
size_t block_header_len,
|
size_t block_header_len,
|
||||||
uint8_t* outputs,
|
uint8_t* outputs,
|
||||||
uint32_t num_blocks
|
uint32_t num_blocks
|
||||||
) {
|
) {
|
||||||
// Reset device to clear any previous errors
|
const uint32_t m_cost = 64;
|
||||||
cudaError_t err = cudaDeviceReset();
|
|
||||||
if (err != cudaSuccess) {
|
|
||||||
fprintf(stderr, "CUDA error: Failed to reset device: %s\n",
|
|
||||||
cudaGetErrorString(err));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
uint8_t *d_input = NULL;
|
||||||
|
uint8_t *d_output = NULL;
|
||||||
|
block *d_memory = NULL;
|
||||||
|
|
||||||
// Check available memory
|
cudaError_t err;
|
||||||
size_t free_mem, total_mem;
|
|
||||||
err = cudaMemGetInfo(&free_mem, &total_mem);
|
|
||||||
if (err != cudaSuccess) {
|
|
||||||
//fprintf(stderr, "CUDA error: Failed to get memory info: %s\n",
|
|
||||||
// cudaGetErrorString(err));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t headers_size = num_blocks * block_header_len;
|
err = cudaMalloc((void**)&d_input, block_header_len);
|
||||||
size_t outputs_size = num_blocks * 32;
|
if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc header: %s\n", cudaGetErrorString(err)); return; }
|
||||||
size_t required_mem = headers_size + outputs_size;
|
err = cudaMalloc((void**)&d_output, 32);
|
||||||
|
if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc output: %s\n", cudaGetErrorString(err)); cudaFree(d_input); return; }
|
||||||
|
err = cudaMalloc((void**)&d_memory, m_cost * sizeof(block));
|
||||||
|
if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc argon2 mem: %s\n", cudaGetErrorString(err)); cudaFree(d_input); cudaFree(d_output); return; }
|
||||||
|
|
||||||
if (required_mem > free_mem) {
|
|
||||||
fprintf(stderr, "CUDA error: Not enough memory (required: %zu, free: %zu)\n",
|
|
||||||
required_mem, free_mem);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Allocate device memory
|
|
||||||
uint8_t *d_headers = NULL;
|
|
||||||
uint8_t *d_outputs = NULL;
|
|
||||||
|
|
||||||
// Allocate memory for input block headers with error check
|
|
||||||
err = cudaMalloc((void**)&d_headers, headers_size);
|
|
||||||
if (err != cudaSuccess) {
|
|
||||||
fprintf(stderr, "CUDA error: Failed to allocate device memory for headers (%zu bytes): %s\n",
|
|
||||||
headers_size, cudaGetErrorString(err));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Allocate memory for output hashes with error check
|
|
||||||
err = cudaMalloc((void**)&d_outputs, outputs_size);
|
|
||||||
if (err != cudaSuccess) {
|
|
||||||
fprintf(stderr, "CUDA error: Failed to allocate device memory for outputs (%zu bytes): %s\n",
|
|
||||||
outputs_size, cudaGetErrorString(err));
|
|
||||||
cudaFree(d_headers);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Copy block headers from host to device
|
|
||||||
err = cudaMemcpy(d_headers, block_headers, headers_size, cudaMemcpyHostToDevice);
|
|
||||||
if (err != cudaSuccess) {
|
|
||||||
fprintf(stderr, "CUDA error: Failed to copy headers to device: %s\n",
|
|
||||||
cudaGetErrorString(err));
|
|
||||||
cudaFree(d_headers);
|
|
||||||
cudaFree(d_outputs);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process one header at a time to isolate any issues
|
|
||||||
for (uint32_t i = 0; i < num_blocks; i++) {
|
for (uint32_t i = 0; i < num_blocks; i++) {
|
||||||
const uint8_t* input = d_headers + i * block_header_len;
|
const uint8_t* input = block_headers + i * block_header_len;
|
||||||
uint8_t* output = d_outputs + i * 32;
|
uint8_t* output = outputs + i * 32;
|
||||||
|
|
||||||
// Call rinhash_cuda_kernel with device pointers and proper launch configuration
|
err = cudaMemcpy(d_input, input, block_header_len, cudaMemcpyHostToDevice);
|
||||||
rinhash_cuda_kernel<<<1, 32>>>(input, block_header_len, output);
|
if (err != cudaSuccess) { fprintf(stderr, "CUDA error: copy header %u: %s\n", i, cudaGetErrorString(err)); break; }
|
||||||
|
|
||||||
// Check for errors after each processing
|
rinhash_cuda_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);
|
||||||
err = cudaGetLastError();
|
|
||||||
if (err != cudaSuccess) {
|
|
||||||
fprintf(stderr, "CUDA error in block %u: %s\n", i, cudaGetErrorString(err));
|
|
||||||
cudaFree(d_headers);
|
|
||||||
cudaFree(d_outputs);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Synchronize device to ensure all operations are complete
|
|
||||||
err = cudaDeviceSynchronize();
|
err = cudaDeviceSynchronize();
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) { fprintf(stderr, "CUDA error in kernel %u: %s\n", i, cudaGetErrorString(err)); break; }
|
||||||
fprintf(stderr, "CUDA error during synchronization: %s\n", cudaGetErrorString(err));
|
|
||||||
cudaFree(d_headers);
|
err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
|
||||||
cudaFree(d_outputs);
|
if (err != cudaSuccess) { fprintf(stderr, "CUDA error: copy out %u: %s\n", i, cudaGetErrorString(err)); break; }
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy results back from device to host
|
cudaFree(d_memory);
|
||||||
err = cudaMemcpy(outputs, d_outputs, outputs_size, cudaMemcpyDeviceToHost);
|
cudaFree(d_output);
|
||||||
if (err != cudaSuccess) {
|
cudaFree(d_input);
|
||||||
fprintf(stderr, "CUDA error: Failed to copy results from device: %s\n",
|
|
||||||
cudaGetErrorString(err));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Free device memory
|
|
||||||
cudaFree(d_headers);
|
|
||||||
cudaFree(d_outputs);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Main RinHash function that would be called from outside
|
// Main RinHash function that would be called from outside
|
||||||
@@ -261,7 +169,6 @@ extern "C" void RinHash(
|
|||||||
uint8_t block_header[80]; // Standard block header size
|
uint8_t block_header[80]; // Standard block header size
|
||||||
size_t block_header_len;
|
size_t block_header_len;
|
||||||
|
|
||||||
// Convert block header to bytes
|
|
||||||
blockheader_to_bytes(
|
blockheader_to_bytes(
|
||||||
version,
|
version,
|
||||||
prev_block,
|
prev_block,
|
||||||
@@ -273,7 +180,6 @@ extern "C" void RinHash(
|
|||||||
&block_header_len
|
&block_header_len
|
||||||
);
|
);
|
||||||
|
|
||||||
// Calculate RinHash
|
|
||||||
rinhash_cuda(block_header, block_header_len, output);
|
rinhash_cuda(block_header, block_header_len, output);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -294,14 +200,10 @@ extern "C" void RinHash_mine(
|
|||||||
std::vector<uint8_t> block_headers(block_header_len * num_nonces);
|
std::vector<uint8_t> block_headers(block_header_len * num_nonces);
|
||||||
std::vector<uint8_t> hashes(32 * num_nonces);
|
std::vector<uint8_t> hashes(32 * num_nonces);
|
||||||
|
|
||||||
// Prepare block headers with different nonces
|
|
||||||
for (uint32_t i = 0; i < num_nonces; i++) {
|
for (uint32_t i = 0; i < num_nonces; i++) {
|
||||||
uint32_t current_nonce = start_nonce + i;
|
uint32_t current_nonce = start_nonce + i;
|
||||||
|
|
||||||
// Fill in the common parts of the header
|
|
||||||
uint8_t* header = block_headers.data() + i * block_header_len;
|
uint8_t* header = block_headers.data() + i * block_header_len;
|
||||||
size_t header_len;
|
size_t header_len;
|
||||||
|
|
||||||
blockheader_to_bytes(
|
blockheader_to_bytes(
|
||||||
version,
|
version,
|
||||||
prev_block,
|
prev_block,
|
||||||
@@ -314,31 +216,17 @@ extern "C" void RinHash_mine(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate hashes for all nonces
|
|
||||||
rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
|
rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
|
||||||
|
|
||||||
// Find the best hash (lowest value)
|
|
||||||
memcpy(best_hash, hashes.data(), 32);
|
memcpy(best_hash, hashes.data(), 32);
|
||||||
*found_nonce = start_nonce;
|
*found_nonce = start_nonce;
|
||||||
|
|
||||||
for (uint32_t i = 1; i < num_nonces; i++) {
|
for (uint32_t i = 1; i < num_nonces; i++) {
|
||||||
uint8_t* current_hash = hashes.data() + i * 32;
|
uint8_t* current_hash = hashes.data() + i * 32;
|
||||||
|
|
||||||
// Compare current hash with best hash (byte by byte, from most significant to least)
|
|
||||||
bool is_better = false;
|
bool is_better = false;
|
||||||
for (int j = 0; j < 32; j++) {
|
for (int j = 0; j < 32; j++) {
|
||||||
if (current_hash[j] < best_hash[j]) {
|
if (current_hash[j] < best_hash[j]) { is_better = true; break; }
|
||||||
is_better = true;
|
else if (current_hash[j] > best_hash[j]) { break; }
|
||||||
break;
|
|
||||||
}
|
|
||||||
else if (current_hash[j] > best_hash[j]) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (is_better) {
|
|
||||||
memcpy(best_hash, current_hash, 32);
|
|
||||||
*found_nonce = start_nonce + i;
|
|
||||||
}
|
}
|
||||||
|
if (is_better) { memcpy(best_hash, current_hash, 32); *found_nonce = start_nonce + i; }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user