rockm
This commit is contained in:
283
rin/miner/hip-output/rinhash.hip.cu
Normal file
283
rin/miner/hip-output/rinhash.hip.cu
Normal file
@@ -0,0 +1,283 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
// Include shared device functions
|
||||
#include "rinhash_device.cuh"
|
||||
#include "argon2d_device.cuh"
|
||||
#include "sha3-256.hip.cu"
|
||||
#include "blake3_device.cuh"
|
||||
|
||||
// Modified kernel to use device functions and write output
|
||||
extern "C" __global__ void rinhash_cuda_kernel(
|
||||
const uint8_t* input,
|
||||
size_t input_len,
|
||||
uint8_t* output,
|
||||
block* argon2_memory
|
||||
) {
|
||||
__shared__ uint8_t blake3_out[32];
|
||||
__shared__ uint8_t argon2_out[32];
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
// Step 1: BLAKE3 hash
|
||||
light_hash_device(input, input_len, blake3_out);
|
||||
|
||||
// Step 2: Argon2d hash (t_cost=2, m_cost=64, lanes=1)
|
||||
uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
|
||||
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11);
|
||||
|
||||
// Step 3: SHA3-256 hash
|
||||
uint8_t sha3_out[32];
|
||||
sha3_256_device(argon2_out, 32, sha3_out);
|
||||
|
||||
// Write result to output
|
||||
for (int i = 0; i < 32; i++) {
|
||||
output[i] = sha3_out[i];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// RinHash HIP implementation for a single header
|
||||
extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
|
||||
// Argon2 parameters
|
||||
const uint32_t m_cost = 64; // blocks (64 KiB)
|
||||
|
||||
uint8_t *d_input = nullptr;
|
||||
uint8_t *d_output = nullptr;
|
||||
block *d_memory = nullptr;
|
||||
|
||||
cudaError_t err;
|
||||
|
||||
// Allocate device buffers
|
||||
err = cudaMalloc(&d_input, input_len);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = cudaMalloc(&d_output, 32);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate output memory: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_input);
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocate Argon2 memory once per hash
|
||||
err = cudaMalloc(&d_memory, m_cost * sizeof(block));
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_input);
|
||||
cudaFree(d_output);
|
||||
return;
|
||||
}
|
||||
|
||||
// Copy input header
|
||||
err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_memory);
|
||||
cudaFree(d_input);
|
||||
cudaFree(d_output);
|
||||
return;
|
||||
}
|
||||
|
||||
// Launch the kernel (single thread is fine for single hash)
|
||||
rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);
|
||||
|
||||
// Wait
|
||||
err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error during kernel execution: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_memory);
|
||||
cudaFree(d_input);
|
||||
cudaFree(d_output);
|
||||
return;
|
||||
}
|
||||
|
||||
// Copy result
|
||||
err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
|
||||
}
|
||||
|
||||
// Free
|
||||
cudaFree(d_memory);
|
||||
cudaFree(d_input);
|
||||
cudaFree(d_output);
|
||||
}
|
||||
|
||||
// Helper function to convert a block header to bytes
|
||||
extern "C" void blockheader_to_bytes(
|
||||
const uint32_t* version,
|
||||
const uint32_t* prev_block,
|
||||
const uint32_t* merkle_root,
|
||||
const uint32_t* timestamp,
|
||||
const uint32_t* bits,
|
||||
const uint32_t* nonce,
|
||||
uint8_t* output,
|
||||
size_t* output_len
|
||||
) {
|
||||
size_t offset = 0;
|
||||
|
||||
memcpy(output + offset, version, 4); offset += 4;
|
||||
memcpy(output + offset, prev_block, 32); offset += 32;
|
||||
memcpy(output + offset, merkle_root, 32); offset += 32;
|
||||
memcpy(output + offset, timestamp, 4); offset += 4;
|
||||
memcpy(output + offset, bits, 4); offset += 4;
|
||||
memcpy(output + offset, nonce, 4); offset += 4;
|
||||
|
||||
*output_len = offset;
|
||||
}
|
||||
|
||||
// Batch processing version for mining (sequential per header for correctness)
|
||||
extern "C" void rinhash_cuda_batch(
|
||||
const uint8_t* block_headers,
|
||||
size_t block_header_len,
|
||||
uint8_t* outputs,
|
||||
uint32_t num_blocks
|
||||
) {
|
||||
// Argon2 parameters
|
||||
const uint32_t m_cost = 64;
|
||||
|
||||
// Allocate reusable device buffers
|
||||
uint8_t *d_input = nullptr;
|
||||
uint8_t *d_output = nullptr;
|
||||
block *d_memory = nullptr;
|
||||
|
||||
cudaError_t err;
|
||||
|
||||
err = cudaMalloc(&d_input, block_header_len);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate header buffer: %s\n", cudaGetErrorString(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = cudaMalloc(&d_output, 32);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate output buffer: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_input);
|
||||
return;
|
||||
}
|
||||
|
||||
err = cudaMalloc(&d_memory, m_cost * sizeof(block));
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_input);
|
||||
cudaFree(d_output);
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < num_blocks; i++) {
|
||||
const uint8_t* header = block_headers + i * block_header_len;
|
||||
uint8_t* out = outputs + i * 32;
|
||||
|
||||
err = cudaMemcpy(d_input, header, block_header_len, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: copy header %u failed: %s\n", i, cudaGetErrorString(err));
|
||||
break;
|
||||
}
|
||||
|
||||
rinhash_cuda_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);
|
||||
|
||||
err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error in kernel %u: %s\n", i, cudaGetErrorString(err));
|
||||
break;
|
||||
}
|
||||
|
||||
err = cudaMemcpy(out, d_output, 32, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: copy out %u failed: %s\n", i, cudaGetErrorString(err));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
cudaFree(d_memory);
|
||||
cudaFree(d_output);
|
||||
cudaFree(d_input);
|
||||
}
|
||||
|
||||
// Main RinHash function that would be called from outside
|
||||
extern "C" void RinHash(
|
||||
const uint32_t* version,
|
||||
const uint32_t* prev_block,
|
||||
const uint32_t* merkle_root,
|
||||
const uint32_t* timestamp,
|
||||
const uint32_t* bits,
|
||||
const uint32_t* nonce,
|
||||
uint8_t* output
|
||||
) {
|
||||
uint8_t block_header[80];
|
||||
size_t block_header_len;
|
||||
|
||||
blockheader_to_bytes(
|
||||
version,
|
||||
prev_block,
|
||||
merkle_root,
|
||||
timestamp,
|
||||
bits,
|
||||
nonce,
|
||||
block_header,
|
||||
&block_header_len
|
||||
);
|
||||
|
||||
rinhash_cuda(block_header, block_header_len, output);
|
||||
}
|
||||
|
||||
// Mining function that tries different nonces (host-side best selection)
|
||||
extern "C" void RinHash_mine(
|
||||
const uint32_t* version,
|
||||
const uint32_t* prev_block,
|
||||
const uint32_t* merkle_root,
|
||||
const uint32_t* timestamp,
|
||||
const uint32_t* bits,
|
||||
uint32_t start_nonce,
|
||||
uint32_t num_nonces,
|
||||
uint32_t* found_nonce,
|
||||
uint8_t* target_hash,
|
||||
uint8_t* best_hash
|
||||
) {
|
||||
const size_t block_header_len = 80;
|
||||
std::vector<uint8_t> block_headers(block_header_len * num_nonces);
|
||||
std::vector<uint8_t> hashes(32 * num_nonces);
|
||||
|
||||
for (uint32_t i = 0; i < num_nonces; i++) {
|
||||
uint32_t current_nonce = start_nonce + i;
|
||||
uint8_t* header = block_headers.data() + i * block_header_len;
|
||||
size_t header_len;
|
||||
|
||||
blockheader_to_bytes(
|
||||
version,
|
||||
prev_block,
|
||||
merkle_root,
|
||||
timestamp,
|
||||
bits,
|
||||
¤t_nonce,
|
||||
header,
|
||||
&header_len
|
||||
);
|
||||
}
|
||||
|
||||
rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
|
||||
|
||||
memcpy(best_hash, hashes.data(), 32);
|
||||
*found_nonce = start_nonce;
|
||||
|
||||
for (uint32_t i = 1; i < num_nonces; i++) {
|
||||
uint8_t* current_hash = hashes.data() + i * 32;
|
||||
bool is_better = false;
|
||||
for (int j = 0; j < 32; j++) {
|
||||
if (current_hash[j] < best_hash[j]) { is_better = true; break; }
|
||||
else if (current_hash[j] > best_hash[j]) { break; }
|
||||
}
|
||||
if (is_better) {
|
||||
memcpy(best_hash, current_hash, 32);
|
||||
*found_nonce = start_nonce + i;
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user