#include "blaze3_cpu.cuh" // Number of threads per thread block __constant__ const int NUM_THREADS = 16; // redefine functions, but for the GPU // all of them are the same but with g_ prefixed __constant__ const u32 g_IV[8] = { 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, }; __constant__ const int g_MSG_PERMUTATION[] = { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }; __device__ u32 g_rotr(u32 value, int shift) { return (value >> shift)|(value << (usize - shift)); } __device__ void g_g(u32 state[16], u32 a, u32 b, u32 c, u32 d, u32 mx, u32 my) { state[a] = state[a] + state[b] + mx; state[d] = g_rotr((state[d] ^ state[a]), 16); state[c] = state[c] + state[d]; state[b] = g_rotr((state[b] ^ state[c]), 12); state[a] = state[a] + state[b] + my; state[d] = g_rotr((state[d] ^ state[a]), 8); state[c] = state[c] + state[d]; state[b] = g_rotr((state[b] ^ state[c]), 7); } __device__ void g_round(u32 state[16], u32 m[16]) { // Mix the columns. g_g(state, 0, 4, 8, 12, m[0], m[1]); g_g(state, 1, 5, 9, 13, m[2], m[3]); g_g(state, 2, 6, 10, 14, m[4], m[5]); g_g(state, 3, 7, 11, 15, m[6], m[7]); // Mix the diagonals. g_g(state, 0, 5, 10, 15, m[8], m[9]); g_g(state, 1, 6, 11, 12, m[10], m[11]); g_g(state, 2, 7, 8, 13, m[12], m[13]); g_g(state, 3, 4, 9, 14, m[14], m[15]); } __device__ void g_permute(u32 m[16]) { u32 permuted[16]; for(int i=0; i<16; i++) permuted[i] = m[g_MSG_PERMUTATION[i]]; for(int i=0; i<16; i++) m[i] = permuted[i]; } // custom memcpy, apparently cuda's memcpy is slow // when called within a kernel __device__ void g_memcpy(u32 *lhs, const u32 *rhs, int size) { // assuming u32 is 4 bytes int len = size / 4; for(int i=0; i __device__ void g_memset(ptr_t dest, T val, int count) { for(int i=0; i> 32); state[14] = block_len; state[15] = flags; u32 block[16]; g_memcpy(block, block_words, 64); g_round(state, block); // round 1 g_permute(block); g_round(state, block); // round 2 g_permute(block); g_round(state, block); // round 3 g_permute(block); g_round(state, block); // round 4 g_permute(block); g_round(state, block); // round 5 g_permute(block); g_round(state, block); // round 6 g_permute(block); g_round(state, block); // round 7 for(int i=0; i<8; i++){ state[i] ^= state[i + 8]; state[i + 8] ^= chaining_value[i]; } } __device__ void g_words_from_little_endian_bytes( u8 *bytes, u32 *words, u32 bytes_len ) { u32 tmp; for(u32 i=0; i leaf_len) block_len = leaf_len%BLOCK_LEN; else block_len = BLOCK_LEN; // special case if(empty_input) block_len = 0; // clear up block_words g_memset(block_words, 0, 16); u32 new_block_len(block_len); if(block_len%4) new_block_len += 4 - (block_len%4); // This memcpy is fine since data is a byte array memcpy(block_cast, leaf_data+i, new_block_len*sizeof(*block_cast)); g_words_from_little_endian_bytes(leaf_data+i, block_words, new_block_len); if(i==0) flagger |= CHUNK_START; if(i+BLOCK_LEN >= leaf_len) flagger |= CHUNK_END | out_flags; // raw hash for root node g_compress( chaining_value, block_words, counter, block_len, flagger, raw_hash ); g_memcpy(chaining_value, raw_hash, 32); } } __global__ void compute(Chunk *data, int l, int r) { // n is always a power of 2 int n = r-l; int tid = blockDim.x * blockIdx.x + threadIdx.x; if(tid >= n) return; if(n==1) { data[l].g_compress_chunk(); // printf("Compressing : %d\n", l); } else { compute<<>>(data, l, l+n/2); cudaDeviceSynchronize(); compute<<>>(data, l+n/2, r); cudaDeviceSynchronize(); data[l].flags |= PARENT; memcpy(data[l].data, data[l].raw_hash, 32); 
// Recursively hash the binary tree of chunks in data[l, r) using CUDA
// dynamic parallelism. Each launch is single-threaded; the recursion
// itself forms the binary tree.
__global__ void compute(Chunk *data, int l, int r) {
    // n is always a power of 2
    int n = r-l;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if(tid >= n) return;

    if(n==1) {
        data[l].g_compress_chunk();
        // printf("Compressing : %d\n", l);
    }
    else {
        // hash both halves, then combine them into the parent node
        compute<<<1, 1>>>(data, l, l+n/2);
        cudaDeviceSynchronize();
        compute<<<1, 1>>>(data, l+n/2, r);
        cudaDeviceSynchronize();

        data[l].flags |= PARENT;
        memcpy(data[l].data, data[l].raw_hash, 32);
        memcpy(data[l].data+8, data[l+n/2].raw_hash, 32);
        data[l].g_compress_chunk();
        // printf("Compressing : %d to %d\n", l, r);
    }
}

// CPU-callable version of light_hash (logic unchanged from the CPU version)
void light_hash(Chunk *data, int N, Chunk *result, Chunk *memory_bar) {
    const int data_size = N*sizeof(Chunk);

    // Device settings
    // Allows cudaDeviceSynchronize to be called up to 16 levels deep
    // in the kernel recursion
    cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 16);

    // Device vector
    Chunk *g_data = memory_bar;
    cudaMemcpy(g_data, data, data_size, cudaMemcpyHostToDevice);

    // Actual computation of hash
    compute<<<1, 1>>>(g_data, 0, N);

    cudaMemcpy(result, g_data, sizeof(Chunk), cudaMemcpyDeviceToHost);
}

// Device-callable version of light_hash
__device__ void light_hash_device(const uint8_t* input, size_t input_len, uint8_t* output) {
    // Create a single chunk for processing the input
    Chunk chunk;

    // Initialize the chunk key with the device constant IV
    for (int i = 0; i < 8; i++) {
        chunk.key[i] = g_IV[i];
    }

    // Copy the input data to leaf_data (with bounds checking)
    size_t copy_len = min(input_len, (size_t)BLOCK_LEN * 16); // Ensure we don't overflow
    for (size_t i = 0; i < copy_len; i++) {
        chunk.leaf_data[i] = input[i];
    }

    chunk.leaf_len = copy_len;
    chunk.counter = 0;
    chunk.flags = 0; // Default flags

    // Process the chunk directly
    chunk.g_compress_chunk(ROOT); // Set ROOT flag for final output

    // Copy the raw hash to the output
    for (int i = 0; i < 8; i++) {
        // Convert 32-bit words to bytes in little-endian format
        output[i*4]   = (uint8_t)(chunk.raw_hash[i]);
        output[i*4+1] = (uint8_t)(chunk.raw_hash[i] >> 8);
        output[i*4+2] = (uint8_t)(chunk.raw_hash[i] >> 16);
        output[i*4+3] = (uint8_t)(chunk.raw_hash[i] >> 24);
    }
}

// Alias for compatibility with other device code
__device__ void blake3_hash_device(const uint8_t* input, size_t input_len, uint8_t* output) {
    light_hash_device(input, input_len, output);
}
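
// Usage sketch for the host-side entry point above (illustrative, not part
// of the original file). Chunk initialisation mirrors light_hash_device and
// is an assumption, as is the host-side IV constant from blaze3_cpu.cuh.
// N must be a power of two, and memory_bar a device allocation big enough
// for N chunks; error checking is omitted for brevity.
void light_hash_example(const uint8_t *input, size_t input_len) {
    const int N = 1; // single-chunk example
    Chunk *chunks = new Chunk[N];

    // set up the chunk the same way light_hash_device does
    for (int i = 0; i < 8; i++)
        chunks[0].key[i] = IV[i]; // assumed host-side IV from blaze3_cpu.cuh
    size_t max_len = (size_t)BLOCK_LEN * 16;
    size_t copy_len = input_len < max_len ? input_len : max_len;
    for (size_t i = 0; i < copy_len; i++)
        chunks[0].leaf_data[i] = input[i];
    chunks[0].leaf_len = copy_len;
    chunks[0].counter = 0;
    chunks[0].flags = 0;

    Chunk result;
    Chunk *memory_bar; // device scratch space for the chunk tree
    cudaMalloc(&memory_bar, N * sizeof(Chunk));

    light_hash(chunks, N, &result, memory_bar);
    // result.raw_hash[0..7] now holds the output words of the root chunk

    cudaFree(memory_bar);
    delete[] chunks;
}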