rockm specific implementation

This commit is contained in:
Dobromir Popov
2025-09-05 03:38:45 +03:00
parent f5b05ce531
commit 614c390692
3 changed files with 117 additions and 176 deletions

Submodule rin/miner/cpuminer/cpuminer-opt-rin added at dfbd6b03a6

View File

@@ -1,5 +1,4 @@
#include <cuda_runtime.h> #include "hip_runtime_shim.h"
#include <device_launch_parameters.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
@@ -9,100 +8,109 @@
// Include shared device functions // Include shared device functions
#include "rinhash_device.cuh" #include "rinhash_device.cuh"
#include "argon2d_device.cuh" #include "argon2d_device.cuh"
#include "sha3-256.cu" #include "sha3-256.hip.cu"
#include "blake3_device.cuh" #include "blake3_device.cuh"
// Modified kernel to use device functions and write output
// External references to our CUDA implementations
extern "C" void blake3_hash(const uint8_t* input, size_t input_len, uint8_t* output);
extern "C" void argon2d_hash_rinhash(uint8_t* output, const uint8_t* input, size_t input_len);
extern "C" void sha3_256_hash(const uint8_t* input, size_t input_len, uint8_t* output);
// Modified kernel to use device functions
extern "C" __global__ void rinhash_cuda_kernel( extern "C" __global__ void rinhash_cuda_kernel(
const uint8_t* input, const uint8_t* input,
size_t input_len, size_t input_len,
uint8_t* output uint8_t* output,
block* argon2_memory
) { ) {
// Intermediate results in shared memory
__shared__ uint8_t blake3_out[32]; __shared__ uint8_t blake3_out[32];
__shared__ uint8_t argon2_out[32]; __shared__ uint8_t argon2_out[32];
// Only one thread should do this work
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
// Step 1: BLAKE3 hash - now using light_hash_device // Step 1: BLAKE3 hash
light_hash_device(input, input_len, blake3_out); light_hash_device(input, input_len, blake3_out);
// Step 2: Argon2d hash
uint32_t m_cost = 64; // Example // Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1)
size_t memory_size = m_cost * sizeof(block);
block* d_memory = (block*)malloc(memory_size);
uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' }; uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, d_memory, salt, 11); device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11);
// Step 3: SHA3-256 hash // Step 3: SHA3-256 hash
uint8_t sha3_out[32]; uint8_t sha3_out[32];
sha3_256_device(argon2_out, 32, sha3_out); sha3_256_device(argon2_out, 32, sha3_out);
// Write result to output
for (int i = 0; i < 32; i++) {
output[i] = sha3_out[i];
}
} }
// Use syncthreads to ensure all threads wait for the computation to complete
__syncthreads(); __syncthreads();
} }
// RinHash CUDA implementation // RinHash HIP implementation for a single header
extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) { extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
// Allocate device memory // Argon2 parameters
const uint32_t m_cost = 64; // blocks (64 KiB)
uint8_t *d_input = nullptr; uint8_t *d_input = nullptr;
uint8_t *d_output = nullptr; uint8_t *d_output = nullptr;
block *d_memory = nullptr;
cudaError_t err; cudaError_t err;
// Allocate memory on device // Allocate device buffers
err = cudaMalloc(&d_input, input_len); err = cudaMalloc(&d_input, input_len);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err)); fprintf(stderr, "HIP error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
return; return;
} }
err = cudaMalloc(&d_output, 32); err = cudaMalloc(&d_output, 32);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to allocate output memory: %s\n", cudaGetErrorString(err)); fprintf(stderr, "HIP error: Failed to allocate output memory: %s\n", cudaGetErrorString(err));
cudaFree(d_input); cudaFree(d_input);
return; return;
} }
// Copy input data to device // Allocate Argon2 memory once per hash
err = cudaMalloc(&d_memory, m_cost * sizeof(block));
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
cudaFree(d_input);
cudaFree(d_output);
return;
}
// Copy input header
err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice); err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err)); fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
cudaFree(d_memory);
cudaFree(d_input); cudaFree(d_input);
cudaFree(d_output); cudaFree(d_output);
return; return;
} }
// Launch the kernel // Launch the kernel (single thread is fine for single hash)
rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output); rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);
// Wait for kernel to finish // Wait
err = cudaDeviceSynchronize(); err = cudaDeviceSynchronize();
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err)); fprintf(stderr, "HIP error during kernel execution: %s\n", cudaGetErrorString(err));
cudaFree(d_memory);
cudaFree(d_input); cudaFree(d_input);
cudaFree(d_output); cudaFree(d_output);
return; return;
} }
// Copy result back to host // Copy result
err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost); err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err)); fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
} }
// Free device memory // Free
cudaFree(d_memory);
cudaFree(d_input); cudaFree(d_input);
cudaFree(d_output); cudaFree(d_output);
} }
// Helper function to convert a block header to bytes // Helper function to convert a block header to bytes
extern "C" void blockheader_to_bytes( extern "C" void blockheader_to_bytes(
const uint32_t* version, const uint32_t* version,
@@ -115,137 +123,83 @@ extern "C" void blockheader_to_bytes(
size_t* output_len size_t* output_len
) { ) {
size_t offset = 0; size_t offset = 0;
// Version (4 bytes) memcpy(output + offset, version, 4); offset += 4;
memcpy(output + offset, version, 4); memcpy(output + offset, prev_block, 32); offset += 32;
offset += 4; memcpy(output + offset, merkle_root, 32); offset += 32;
memcpy(output + offset, timestamp, 4); offset += 4;
// Previous block hash (32 bytes) memcpy(output + offset, bits, 4); offset += 4;
memcpy(output + offset, prev_block, 32); memcpy(output + offset, nonce, 4); offset += 4;
offset += 32;
// Merkle root (32 bytes)
memcpy(output + offset, merkle_root, 32);
offset += 32;
// Timestamp (4 bytes)
memcpy(output + offset, timestamp, 4);
offset += 4;
// Bits (4 bytes)
memcpy(output + offset, bits, 4);
offset += 4;
// Nonce (4 bytes)
memcpy(output + offset, nonce, 4);
offset += 4;
*output_len = offset; *output_len = offset;
} }
// Batch processing version for mining // Batch processing version for mining (sequential per header for correctness)
extern "C" void rinhash_cuda_batch( extern "C" void rinhash_cuda_batch(
const uint8_t* block_headers, const uint8_t* block_headers,
size_t block_header_len, size_t block_header_len,
uint8_t* outputs, uint8_t* outputs,
uint32_t num_blocks uint32_t num_blocks
) { ) {
// Reset device to clear any previous errors // Argon2 parameters
cudaError_t err = cudaDeviceReset(); const uint32_t m_cost = 64;
// Allocate reusable device buffers
uint8_t *d_input = nullptr;
uint8_t *d_output = nullptr;
block *d_memory = nullptr;
cudaError_t err;
err = cudaMalloc(&d_input, block_header_len);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to reset device: %s\n", fprintf(stderr, "HIP error: Failed to allocate header buffer: %s\n", cudaGetErrorString(err));
cudaGetErrorString(err));
return; return;
} }
err = cudaMalloc(&d_output, 32);
// Check available memory
size_t free_mem, total_mem;
err = cudaMemGetInfo(&free_mem, &total_mem);
if (err != cudaSuccess) { if (err != cudaSuccess) {
//fprintf(stderr, "CUDA error: Failed to get memory info: %s\n", fprintf(stderr, "HIP error: Failed to allocate output buffer: %s\n", cudaGetErrorString(err));
// cudaGetErrorString(err)); cudaFree(d_input);
return; return;
} }
size_t headers_size = num_blocks * block_header_len; err = cudaMalloc(&d_memory, m_cost * sizeof(block));
size_t outputs_size = num_blocks * 32;
size_t required_mem = headers_size + outputs_size;
if (required_mem > free_mem) {
fprintf(stderr, "CUDA error: Not enough memory (required: %zu, free: %zu)\n",
required_mem, free_mem);
return;
}
// Allocate device memory
uint8_t *d_headers = NULL;
uint8_t *d_outputs = NULL;
// Allocate memory for input block headers with error check
err = cudaMalloc((void**)&d_headers, headers_size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to allocate device memory for headers (%zu bytes): %s\n", fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
headers_size, cudaGetErrorString(err)); cudaFree(d_input);
cudaFree(d_output);
return; return;
} }
// Allocate memory for output hashes with error check
err = cudaMalloc((void**)&d_outputs, outputs_size);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to allocate device memory for outputs (%zu bytes): %s\n",
outputs_size, cudaGetErrorString(err));
cudaFree(d_headers);
return;
}
// Copy block headers from host to device
err = cudaMemcpy(d_headers, block_headers, headers_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to copy headers to device: %s\n",
cudaGetErrorString(err));
cudaFree(d_headers);
cudaFree(d_outputs);
return;
}
// Process one header at a time to isolate any issues
for (uint32_t i = 0; i < num_blocks; i++) { for (uint32_t i = 0; i < num_blocks; i++) {
const uint8_t* input = d_headers + i * block_header_len; const uint8_t* header = block_headers + i * block_header_len;
uint8_t* output = d_outputs + i * 32; uint8_t* out = outputs + i * 32;
// Call rinhash_cuda_kernel with device pointers and proper launch configuration err = cudaMemcpy(d_input, header, block_header_len, cudaMemcpyHostToDevice);
rinhash_cuda_kernel<<<1, 32>>>(input, block_header_len, output);
// Check for errors after each processing
err = cudaGetLastError();
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "CUDA error in block %u: %s\n", i, cudaGetErrorString(err)); fprintf(stderr, "HIP error: copy header %u failed: %s\n", i, cudaGetErrorString(err));
cudaFree(d_headers); break;
cudaFree(d_outputs); }
return;
rinhash_cuda_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
fprintf(stderr, "HIP error in kernel %u: %s\n", i, cudaGetErrorString(err));
break;
}
err = cudaMemcpy(out, d_output, 32, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: copy out %u failed: %s\n", i, cudaGetErrorString(err));
break;
} }
} }
// Synchronize device to ensure all operations are complete cudaFree(d_memory);
err = cudaDeviceSynchronize(); cudaFree(d_output);
if (err != cudaSuccess) { cudaFree(d_input);
fprintf(stderr, "CUDA error during synchronization: %s\n", cudaGetErrorString(err));
cudaFree(d_headers);
cudaFree(d_outputs);
return;
}
// Copy results back from device to host
err = cudaMemcpy(outputs, d_outputs, outputs_size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to copy results from device: %s\n",
cudaGetErrorString(err));
}
// Free device memory
cudaFree(d_headers);
cudaFree(d_outputs);
} }
// Main RinHash function that would be called from outside // Main RinHash function that would be called from outside
@@ -258,10 +212,9 @@ extern "C" void RinHash(
const uint32_t* nonce, const uint32_t* nonce,
uint8_t* output uint8_t* output
) { ) {
uint8_t block_header[80]; // Standard block header size uint8_t block_header[80];
size_t block_header_len; size_t block_header_len;
// Convert block header to bytes
blockheader_to_bytes( blockheader_to_bytes(
version, version,
prev_block, prev_block,
@@ -272,12 +225,11 @@ extern "C" void RinHash(
block_header, block_header,
&block_header_len &block_header_len
); );
// Calculate RinHash
rinhash_cuda(block_header, block_header_len, output); rinhash_cuda(block_header, block_header_len, output);
} }
// Mining function that tries different nonces // Mining function that tries different nonces (host-side best selection)
extern "C" void RinHash_mine( extern "C" void RinHash_mine(
const uint32_t* version, const uint32_t* version,
const uint32_t* prev_block, const uint32_t* prev_block,
@@ -293,15 +245,12 @@ extern "C" void RinHash_mine(
const size_t block_header_len = 80; const size_t block_header_len = 80;
std::vector<uint8_t> block_headers(block_header_len * num_nonces); std::vector<uint8_t> block_headers(block_header_len * num_nonces);
std::vector<uint8_t> hashes(32 * num_nonces); std::vector<uint8_t> hashes(32 * num_nonces);
// Prepare block headers with different nonces
for (uint32_t i = 0; i < num_nonces; i++) { for (uint32_t i = 0; i < num_nonces; i++) {
uint32_t current_nonce = start_nonce + i; uint32_t current_nonce = start_nonce + i;
// Fill in the common parts of the header
uint8_t* header = block_headers.data() + i * block_header_len; uint8_t* header = block_headers.data() + i * block_header_len;
size_t header_len; size_t header_len;
blockheader_to_bytes( blockheader_to_bytes(
version, version,
prev_block, prev_block,
@@ -313,29 +262,19 @@ extern "C" void RinHash_mine(
&header_len &header_len
); );
} }
// Calculate hashes for all nonces
rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces); rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
// Find the best hash (lowest value)
memcpy(best_hash, hashes.data(), 32); memcpy(best_hash, hashes.data(), 32);
*found_nonce = start_nonce; *found_nonce = start_nonce;
for (uint32_t i = 1; i < num_nonces; i++) { for (uint32_t i = 1; i < num_nonces; i++) {
uint8_t* current_hash = hashes.data() + i * 32; uint8_t* current_hash = hashes.data() + i * 32;
// Compare current hash with best hash (byte by byte, from most significant to least)
bool is_better = false; bool is_better = false;
for (int j = 0; j < 32; j++) { for (int j = 0; j < 32; j++) {
if (current_hash[j] < best_hash[j]) { if (current_hash[j] < best_hash[j]) { is_better = true; break; }
is_better = true; else if (current_hash[j] > best_hash[j]) { break; }
break;
}
else if (current_hash[j] > best_hash[j]) {
break;
}
} }
if (is_better) { if (is_better) {
memcpy(best_hash, current_hash, 32); memcpy(best_hash, current_hash, 32);
*found_nonce = start_nonce + i; *found_nonce = start_nonce + i;

View File

@@ -20,4 +20,5 @@ cd cpuminer-opt-rinhash
make -j$(nproc) make -j$(nproc)
# Test the newly built binary # Test the newly built binary
./cpuminer -a rinhash -o stratum+tcp://192.168.0.188:3333 -u username.workername -p x -t 4 ./cpuminer -a rinhash -o stratum+tcp://192.168.0.188:3333 -u db.win -p x -t 4
cpuminer-rinhash.exe -a rinhash -o stratum+tcp://192.168.0.188:3334 -u db.win -p x -t 4