// mines/rin/miner/gpu/RinHash-cuda/blake3_device.cuh

#include "blaze3_cpu.cuh"
// Number of threads per thread block
__constant__ const int NUM_THREADS = 16;
// Device-side redefinitions of the CPU helpers: the logic is identical,
// but every name carries a g_ prefix.
__constant__ const u32 g_IV[8] = {
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
    0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
};
__constant__ const int g_MSG_PERMUTATION[] = {
    2, 6, 3, 10, 7, 0, 4, 13,
    1, 11, 12, 5, 9, 14, 15, 8
};
__device__ u32 g_rotr(u32 value, int shift) {
    // rotate a 32-bit word right by `shift` bits
    return (value >> shift) | (value << (32 - shift));
}
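// Example: g_rotr(0x00000001u, 1) == 0x80000000u.
// g_g below is the BLAKE3 quarter-round ("G" function): it mixes four
// state words with two message words, rotating by 16, 12, 8 and 7 bits.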
__device__ void g_g(u32 state[16], u32 a, u32 b, u32 c, u32 d, u32 mx, u32 my) {
    state[a] = state[a] + state[b] + mx;
    state[d] = g_rotr((state[d] ^ state[a]), 16);
    state[c] = state[c] + state[d];
    state[b] = g_rotr((state[b] ^ state[c]), 12);
    state[a] = state[a] + state[b] + my;
    state[d] = g_rotr((state[d] ^ state[a]), 8);
    state[c] = state[c] + state[d];
    state[b] = g_rotr((state[b] ^ state[c]), 7);
}
__device__ void g_round(u32 state[16], u32 m[16]) {
    // Mix the columns.
    g_g(state, 0, 4, 8, 12, m[0], m[1]);
    g_g(state, 1, 5, 9, 13, m[2], m[3]);
    g_g(state, 2, 6, 10, 14, m[4], m[5]);
    g_g(state, 3, 7, 11, 15, m[6], m[7]);
    // Mix the diagonals.
    g_g(state, 0, 5, 10, 15, m[8], m[9]);
    g_g(state, 1, 6, 11, 12, m[10], m[11]);
    g_g(state, 2, 7, 8, 13, m[12], m[13]);
    g_g(state, 3, 4, 9, 14, m[14], m[15]);
}
__device__ void g_permute(u32 m[16]) {
    u32 permuted[16];
    for(int i=0; i<16; i++)
        permuted[i] = m[g_MSG_PERMUTATION[i]];
    for(int i=0; i<16; i++)
        m[i] = permuted[i];
}
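// Example: after one g_permute, m[0] holds the old m[2] and m[15] holds
// the old m[8], following g_MSG_PERMUTATION.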
// Custom memcpy: the built-in memcpy is reportedly slow when called
// from within a kernel.
__device__ void g_memcpy(u32 *lhs, const u32 *rhs, int size) {
    // size is in bytes and must be a multiple of 4 (u32 is 4 bytes)
    int len = size / 4;
    for(int i=0; i<len; i++)
        lhs[i] = rhs[i];
}
// custom memset
template<typename T, typename ptr_t>
__device__ void g_memset(ptr_t dest, T val, int count) {
    for(int i=0; i<count; i++)
        dest[i] = val;
}
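// Note that count is in elements of T, not bytes, unlike g_memcpy's size:
// g_memset(block_words, 0u, 16) zeroes 16 words (64 bytes).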
__device__ void g_compress(
    u32 *chaining_value,
    u32 *block_words,
    u64 counter,
    u32 block_len,
    u32 flags,
    u32 *state
) {
    // TODO: search for a faster alternative to these copies
    g_memcpy(state, chaining_value, 32);
    g_memcpy(state+8, g_IV, 16);
    state[12] = (u32)counter;
    state[13] = (u32)(counter >> 32);
    state[14] = block_len;
    state[15] = flags;
    u32 block[16];
    g_memcpy(block, block_words, 64);
    g_round(state, block); // round 1
    g_permute(block);
    g_round(state, block); // round 2
    g_permute(block);
    g_round(state, block); // round 3
    g_permute(block);
    g_round(state, block); // round 4
    g_permute(block);
    g_round(state, block); // round 5
    g_permute(block);
    g_round(state, block); // round 6
    g_permute(block);
    g_round(state, block); // round 7
    // feed-forward: fold the two state halves and the input chaining value
    for(int i=0; i<8; i++){
        state[i] ^= state[i + 8];
        state[i + 8] ^= chaining_value[i];
    }
}
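// A minimal single-block usage sketch for g_compress (illustrative only;
// the kernel name and the flag combination below are assumptions):
//
//   __global__ void compress_one_block(u32 *out) {
//       u32 cv[8], block[16] = {0}, state[16];
//       g_memcpy(cv, g_IV, 32);
//       g_compress(cv, block, 0, BLOCK_LEN, CHUNK_START | CHUNK_END | ROOT, state);
//       g_memcpy(out, state, 32); // state[0..7] holds the digest words
//   }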
__device__ void g_words_from_little_endian_bytes(
    u8 *bytes, u32 *words, u32 bytes_len
) {
    u32 tmp;
    for(u32 i=0; i<bytes_len; i+=4) {
        tmp = (bytes[i+3]<<24) | (bytes[i+2]<<16) | (bytes[i+1]<<8) | bytes[i];
        words[i/4] = tmp;
    }
}
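// Example: bytes {0x01, 0x02, 0x03, 0x04} become the word 0x04030201;
// bytes_len must be a multiple of 4.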
__device__ void Chunk::g_compress_chunk(u32 out_flags) {
    if(flags&PARENT) {
        g_compress(
            key,
            data,
            0, // counter is always zero for parent nodes
            BLOCK_LEN,
            flags | out_flags,
            raw_hash
        );
        return;
    }
    u32 chaining_value[8];
    u32 block_len = BLOCK_LEN, flagger;
    g_memcpy(chaining_value, key, 32);
    bool empty_input = (leaf_len==0);
    if(empty_input) {
        for(u32 i=0; i<BLOCK_LEN; i++)
            leaf_data[i] = 0U;
        leaf_len = BLOCK_LEN;
    }
    // keep all allocations outside the loop
    u32 block_words[16];
    u8 block_cast[BLOCK_LEN];
    for(u32 i=0; i<leaf_len; i+=BLOCK_LEN) {
        flagger = flags;
        // the last message block may be shorter than BLOCK_LEN
        if(i+BLOCK_LEN > leaf_len)
            block_len = leaf_len%BLOCK_LEN;
        else
            block_len = BLOCK_LEN;
        // special case: an empty input is hashed as one zero-length block
        if(empty_input)
            block_len = 0;
        // clear block_words, then round the byte count up to whole words
        g_memset(block_words, 0, 16);
        u32 new_block_len(block_len);
        if(block_len%4)
            new_block_len += 4 - (block_len%4);
        // stage the block through a zero-padded byte buffer so the word
        // conversion never reads stale bytes past block_len; the plain
        // memcpy is fine since block_cast is a byte array
        g_memset(block_cast, (u8)0, BLOCK_LEN);
        memcpy(block_cast, leaf_data+i, block_len*sizeof(*block_cast));
        g_words_from_little_endian_bytes(block_cast, block_words, new_block_len);
        if(i==0)
            flagger |= CHUNK_START;
        if(i+BLOCK_LEN >= leaf_len)
            flagger |= CHUNK_END | out_flags;
        // raw_hash receives the full state; the root digest is read from it
        g_compress(
            chaining_value,
            block_words,
            counter,
            block_len,
            flagger,
            raw_hash
        );
        g_memcpy(chaining_value, raw_hash, 32);
    }
}
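// Worked example: a 99-byte leaf splits into a 64-byte block 0 (flagged
// CHUNK_START) and a 35-byte block 1 (flagged CHUNK_END | out_flags).
// Block 1 is zero-padded to 36 bytes for the word conversion, but
// block_len stays 35 in the g_compress call; chaining_value carries
// block 0's output into block 1.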
__global__ void compute(Chunk *data, int l, int r) {
    // n is always a power of 2
    int n = r-l;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    // only one thread per (l, r) range does the work; the rest exit so the
    // child launches and the merge below are not duplicated
    if(tid > 0)
        return;
    if(n==1) {
        data[l].g_compress_chunk();
        // printf("Compressing : %d\n", l);
    }
    else {
        // launch child kernels for the two halves of the range
        compute<<<n/2,16>>>(data, l, l+n/2);
        compute<<<n/2,16>>>(data, l+n/2, r);
        // wait for the child grids to finish before combining their
        // results (device-side sync via CUDA dynamic parallelism)
        cudaDeviceSynchronize();
        data[l].flags |= PARENT;
        memcpy(data[l].data, data[l].raw_hash, 32);
        memcpy(data[l].data+8, data[l+n/2].raw_hash, 32);
        data[l].g_compress_chunk();
        // printf("Compressing : %d to %d\n", l, r);
    }
}
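// Note on recursion depth: hashing N leaf chunks recurses log2(N) levels
// deep, and each level performs one device-side synchronization, so the
// sync-depth limit of 16 set in light_hash below caps N at roughly 2^16
// chunks.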
// Host-side light_hash: copy the chunks to the device, run the tree
// reduction, and copy the root chunk back.
void light_hash(Chunk *data, int N, Chunk *result, Chunk *memory_bar) {
    const int data_size = N*sizeof(Chunk);
    // Device settings:
    // allow device-side synchronization up to 16 levels of kernel recursion
    cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 16);
    // Device vector
    Chunk *g_data = memory_bar;
    cudaMemcpy(g_data, data, data_size, cudaMemcpyHostToDevice);
    // Actual computation of the hash
    compute<<<N,32>>>(g_data, 0, N);
    cudaMemcpy(result, g_data, sizeof(Chunk), cudaMemcpyDeviceToHost);
}
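// Example host-side call (a sketch; N, the chunk initialization, and the
// missing error checks are assumptions, not part of this file):
//
//   int N = 1024;                  // number of leaf chunks, power of two
//   Chunk *chunks = new Chunk[N];  // leaf data filled in elsewhere
//   Chunk result, *memory_bar;
//   cudaMalloc(&memory_bar, N * sizeof(Chunk));
//   light_hash(chunks, N, &result, memory_bar);
//   // result.raw_hash now holds the root hash words
//   cudaFree(memory_bar);
//   delete[] chunks;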
// Device-callable version of light_hash
__device__ void light_hash_device(const uint8_t* input, size_t input_len, uint8_t* output) {
    // Create a single chunk for processing the input
    Chunk chunk;
    // Initialize the chunk key with the device-constant IV
    for (int i = 0; i < 8; i++) {
        chunk.key[i] = g_IV[i];
    }
    // Copy the input data to leaf_data, clamped so it cannot overflow
    size_t copy_len = min(input_len, (size_t)BLOCK_LEN * 16);
    for (size_t i = 0; i < copy_len; i++) {
        chunk.leaf_data[i] = input[i];
    }
    chunk.leaf_len = copy_len;
    chunk.counter = 0;
    chunk.flags = 0; // default flags
    // Process the chunk directly, setting ROOT for the final output
    chunk.g_compress_chunk(ROOT);
    // Serialize the first eight raw-hash words to little-endian bytes
    for (int i = 0; i < 8; i++) {
        output[i*4] = (uint8_t)(chunk.raw_hash[i]);
        output[i*4+1] = (uint8_t)(chunk.raw_hash[i] >> 8);
        output[i*4+2] = (uint8_t)(chunk.raw_hash[i] >> 16);
        output[i*4+3] = (uint8_t)(chunk.raw_hash[i] >> 24);
    }
}
// Alias for compatibility with other device code
__device__ void blake3_hash_device(const uint8_t* input, size_t input_len, uint8_t* output) {
    light_hash_device(input, input_len, output);
}
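// Example device-side usage (hypothetical kernel and buffer names; only
// the blake3_hash_device signature comes from this file):
//
//   __global__ void hash_headers(const uint8_t *header, size_t len,
//                                uint8_t *digest /* 32 bytes */) {
//       blake3_hash_device(header, len, digest);
//   }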