419 lines
12 KiB
Plaintext
419 lines
12 KiB
Plaintext
#include <iostream>
|
|
#include <algorithm>
|
|
#include <cstring>
|
|
#include <vector>
|
|
using namespace std;
|
|
|
|
// Let's use a pinned memory vector!
|
|
// Removed Thrust pinned allocator dependency for portability
|
|
// #include <thrust/host_vector.h>
|
|
// #include <thrust/system/cuda/experimental/pinned_allocator.h>
|
|
|
|
using u32 = uint32_t;
|
|
using u64 = uint64_t;
|
|
using u8 = uint8_t;
|
|
|
|
const u32 OUT_LEN = 32;
|
|
const u32 KEY_LEN = 32;
|
|
const u32 BLOCK_LEN = 64;
|
|
const u32 CHUNK_LEN = 1024;
|
|
// Multiple chunks make a snicker bar :)
|
|
const u32 SNICKER = 1U << 10;
|
|
// Factory height and snicker size have an inversly propotional relationship
|
|
// FACTORY_HT * (log2 SNICKER) + 10 >= 64
|
|
const u32 FACTORY_HT = 5;
|
|
|
|
const u32 CHUNK_START = 1 << 0;
|
|
const u32 CHUNK_END = 1 << 1;
|
|
const u32 PARENT = 1 << 2;
|
|
const u32 ROOT = 1 << 3;
|
|
const u32 KEYED_HASH = 1 << 4;
|
|
const u32 DERIVE_KEY_CONTEXT = 1 << 5;
|
|
const u32 DERIVE_KEY_MATERIAL = 1 << 6;
|
|
|
|
const int usize = sizeof(u32) * 8;
|
|
|
|
u32 IV[8] = {
|
|
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
|
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
|
|
};
|
|
|
|
const int MSG_PERMUTATION[] = {
|
|
2, 6, 3, 10, 7, 0, 4, 13,
|
|
1, 11, 12, 5, 9, 14, 15, 8
|
|
};
|
|
|
|
u32 rotr(u32 value, int shift) {
|
|
return (value >> shift)|(value << (usize - shift));
|
|
}
|
|
|
|
void g(u32 state[16], u32 a, u32 b, u32 c, u32 d, u32 mx, u32 my) {
|
|
state[a] = state[a] + state[b] + mx;
|
|
state[d] = rotr((state[d] ^ state[a]), 16);
|
|
state[c] = state[c] + state[d];
|
|
|
|
state[b] = rotr((state[b] ^ state[c]), 12);
|
|
state[a] = state[a] + state[b] + my;
|
|
state[d] = rotr((state[d] ^ state[a]), 8);
|
|
|
|
state[c] = state[c] + state[d];
|
|
state[b] = rotr((state[b] ^ state[c]), 7);
|
|
}
|
|
|
|
void round(u32 state[16], u32 m[16]) {
|
|
// Mix the columns.
|
|
g(state, 0, 4, 8, 12, m[0], m[1]);
|
|
g(state, 1, 5, 9, 13, m[2], m[3]);
|
|
g(state, 2, 6, 10, 14, m[4], m[5]);
|
|
g(state, 3, 7, 11, 15, m[6], m[7]);
|
|
// Mix the diagonals.
|
|
g(state, 0, 5, 10, 15, m[8], m[9]);
|
|
g(state, 1, 6, 11, 12, m[10], m[11]);
|
|
g(state, 2, 7, 8, 13, m[12], m[13]);
|
|
g(state, 3, 4, 9, 14, m[14], m[15]);
|
|
}
|
|
|
|
void permute(u32 m[16]) {
|
|
u32 permuted[16];
|
|
for(int i=0; i<16; i++)
|
|
permuted[i] = m[MSG_PERMUTATION[i]];
|
|
for(int i=0; i<16; i++)
|
|
m[i] = permuted[i];
|
|
}
|
|
|
|
void compress(
|
|
u32 *chaining_value,
|
|
u32 *block_words,
|
|
u64 counter,
|
|
u32 block_len,
|
|
u32 flags,
|
|
u32 *state
|
|
) {
|
|
memcpy(state, chaining_value, 8*sizeof(*state));
|
|
memcpy(state+8, IV, 4*sizeof(*state));
|
|
state[12] = (u32)counter;
|
|
state[13] = (u32)(counter >> 32);
|
|
state[14] = block_len;
|
|
state[15] = flags;
|
|
|
|
u32 block[16];
|
|
memcpy(block, block_words, 16*sizeof(*block));
|
|
|
|
round(state, block); // round 1
|
|
permute(block);
|
|
round(state, block); // round 2
|
|
permute(block);
|
|
round(state, block); // round 3
|
|
permute(block);
|
|
round(state, block); // round 4
|
|
permute(block);
|
|
round(state, block); // round 5
|
|
permute(block);
|
|
round(state, block); // round 6
|
|
permute(block);
|
|
round(state, block); // round 7
|
|
|
|
for(int i=0; i<8; i++){
|
|
state[i] ^= state[i + 8];
|
|
state[i + 8] ^= chaining_value[i];
|
|
}
|
|
}
|
|
|
|
void words_from_little_endian_bytes(u8 *bytes, u32 *words, u32 bytes_len) {
|
|
u32 tmp;
|
|
for(u32 i=0; i<bytes_len; i+=4) {
|
|
tmp = (bytes[i+3]<<24) | (bytes[i+2]<<16) | (bytes[i+1]<<8) | bytes[i];
|
|
words[i/4] = tmp;
|
|
}
|
|
}
|
|
|
|
struct Chunk {
|
|
// use only when it is a leaf node
|
|
// leaf data may have less than 1024 bytes
|
|
u8 leaf_data[1024];
|
|
u32 leaf_len;
|
|
// use in all other cases
|
|
// data will always have 64 bytes
|
|
u32 data[16];
|
|
u32 flags;
|
|
u32 raw_hash[16];
|
|
u32 key[8];
|
|
// only useful for leaf nodes
|
|
u64 counter;
|
|
// Constructor for leaf nodes
|
|
__device__ __host__ Chunk(char *input, int size, u32 _flags, u32 *_key, u64 ctr){
|
|
counter = ctr;
|
|
flags = _flags;
|
|
memcpy(key, _key, 8*sizeof(*key));
|
|
memset(leaf_data, 0, 1024);
|
|
memcpy(leaf_data, input, size);
|
|
leaf_len = size;
|
|
}
|
|
__device__ __host__ Chunk(u32 _flags, u32 *_key) {
|
|
counter = 0;
|
|
flags = _flags;
|
|
memcpy(key, _key, 8*sizeof(*key));
|
|
leaf_len = 0;
|
|
}
|
|
__device__ __host__ Chunk() {}
|
|
// Chunk() : leaf_len(0) {}
|
|
// process data in sizes of message blocks and store cv in hash
|
|
void compress_chunk(u32=0);
|
|
__device__ void g_compress_chunk(u32=0);
|
|
};
|
|
|
|
void Chunk::compress_chunk(u32 out_flags) {
|
|
if(flags&PARENT) {
|
|
compress(
|
|
key,
|
|
data,
|
|
0, // counter is always zero for parent nodes
|
|
BLOCK_LEN,
|
|
flags | out_flags,
|
|
raw_hash
|
|
);
|
|
return;
|
|
}
|
|
|
|
u32 chaining_value[8], block_len = BLOCK_LEN, flagger;
|
|
memcpy(chaining_value, key, 8*sizeof(*chaining_value));
|
|
|
|
bool empty_input = (leaf_len==0);
|
|
if(empty_input) {
|
|
for(u32 i=0; i<BLOCK_LEN; i++)
|
|
leaf_data[i] = 0U;
|
|
leaf_len = BLOCK_LEN;
|
|
}
|
|
|
|
for(u32 i=0; i<leaf_len; i+=BLOCK_LEN) {
|
|
flagger = flags;
|
|
// for the last message block
|
|
if(i+BLOCK_LEN > leaf_len)
|
|
block_len = leaf_len%BLOCK_LEN;
|
|
else
|
|
block_len = BLOCK_LEN;
|
|
|
|
// special case
|
|
if(empty_input)
|
|
block_len = 0;
|
|
|
|
u32 block_words[16];
|
|
memset(block_words, 0, 16*sizeof(*block_words));
|
|
u32 new_block_len(block_len);
|
|
if(block_len%4)
|
|
new_block_len += 4 - (block_len%4);
|
|
|
|
// BLOCK_LEN is the max possible length of block_cast
|
|
u8 block_cast[BLOCK_LEN];
|
|
memset(block_cast, 0, new_block_len*sizeof(*block_cast));
|
|
memcpy(block_cast, leaf_data+i, block_len*sizeof(*block_cast));
|
|
|
|
words_from_little_endian_bytes(block_cast, block_words, new_block_len);
|
|
|
|
if(i==0)
|
|
flagger |= CHUNK_START;
|
|
if(i+BLOCK_LEN >= leaf_len)
|
|
flagger |= CHUNK_END | out_flags;
|
|
|
|
// raw hash for root node
|
|
compress(
|
|
chaining_value,
|
|
block_words,
|
|
counter,
|
|
block_len,
|
|
flagger,
|
|
raw_hash
|
|
);
|
|
|
|
memcpy(chaining_value, raw_hash, 8*sizeof(*chaining_value));
|
|
}
|
|
}
|
|
|
|
// Fallback alias: use std::vector instead of thrust pinned host vector
|
|
using thrust_vector = std::vector<Chunk>;
|
|
|
|
// The GPU hasher
|
|
void light_hash(Chunk*, int, Chunk*, Chunk*);
|
|
|
|
// Sanity checks
|
|
Chunk hash_many(Chunk *data, int first, int last, Chunk *memory_bar) {
|
|
// n will always be a power of 2
|
|
int n = last-first;
|
|
// Reduce GPU calling overhead
|
|
if(n == 1) {
|
|
data[first].compress_chunk();
|
|
return data[first];
|
|
}
|
|
|
|
Chunk ret;
|
|
light_hash(data+first, n, &ret, memory_bar);
|
|
return ret;
|
|
|
|
// CPU style execution
|
|
// Chunk left, right;
|
|
// left = hash_many(data, first, first+n/2);
|
|
// right = hash_many(data, first+n/2, last);
|
|
// Chunk parent(left.flags, left.key);
|
|
// parent.flags |= PARENT;
|
|
// memcpy(parent.data, left.raw_hash, 32);
|
|
// memcpy(parent.data+8, right.raw_hash, 32);
|
|
// parent.compress_chunk();
|
|
// return parent;
|
|
}
|
|
|
|
Chunk merge(Chunk &left, Chunk &right);
|
|
void hash_root(Chunk &node, vector<u8> &out_slice);
|
|
|
|
struct Hasher {
|
|
u32 key[8];
|
|
u32 flags;
|
|
u64 ctr;
|
|
u64 file_size;
|
|
// A memory bar for CUDA to use during it's computation
|
|
Chunk* memory_bar;
|
|
// Factory is an array of FACTORY_HT possible SNICKER bars
|
|
thrust_vector factory[FACTORY_HT];
|
|
|
|
// methods
|
|
static Hasher new_internal(u32 key[8], u32 flags, u64 fsize);
|
|
static Hasher _new(u64);
|
|
// initializes cuda memory (if needed)
|
|
void init();
|
|
// frees cuda memory (if it is there)
|
|
// free nullptr is a no-op
|
|
~Hasher() {
|
|
if(memory_bar)
|
|
cudaFree(memory_bar);
|
|
else
|
|
free(memory_bar);
|
|
}
|
|
|
|
void update(char *input, int size);
|
|
void finalize(vector<u8> &out_slice);
|
|
void propagate();
|
|
};
|
|
|
|
Hasher Hasher::new_internal(u32 key[8], u32 flags, u64 fsize) {
|
|
return Hasher{
|
|
{
|
|
key[0], key[1], key[2], key[3],
|
|
key[4], key[5], key[6], key[7]
|
|
},
|
|
flags,
|
|
0, // counter
|
|
fsize
|
|
};
|
|
}
|
|
|
|
Hasher Hasher::_new(u64 fsize) { return new_internal(IV, 0, fsize); }
|
|
|
|
void Hasher::init() {
|
|
if(file_size<1) {
|
|
memory_bar = nullptr;
|
|
return;
|
|
}
|
|
u64 num_chunks = ceil(file_size / CHUNK_LEN);
|
|
u32 bar_size = min(num_chunks, (u64)SNICKER);
|
|
// Just for safety :)
|
|
++bar_size;
|
|
cudaMalloc(&memory_bar, bar_size*sizeof(Chunk));
|
|
|
|
// Let the most commonly used places always have memory
|
|
// +1 so that it does not resize when it hits CHUNK_LEN
|
|
u32 RESERVE = SNICKER + 1;
|
|
factory[0].reserve(RESERVE);
|
|
factory[1].reserve(RESERVE);
|
|
}
|
|
|
|
void Hasher::propagate() {
|
|
int level=0;
|
|
// nodes move to upper levels if lower one is one SNICKER long
|
|
while(factory[level].size() == SNICKER) {
|
|
Chunk subtree = hash_many(factory[level].data(), 0, SNICKER, memory_bar);
|
|
factory[level].clear();
|
|
++level;
|
|
factory[level].push_back(subtree);
|
|
}
|
|
}
|
|
|
|
void Hasher::update(char *input, int size) {
|
|
factory[0].push_back(Chunk(input, size, flags, key, ctr));
|
|
++ctr;
|
|
if(factory[0].size() == SNICKER)
|
|
propagate();
|
|
}
|
|
|
|
void Hasher::finalize(vector<u8> &out_slice) {
|
|
Chunk root(flags, key);
|
|
for(int i=0; i<FACTORY_HT; i++) {
|
|
vector<Chunk> subtrees;
|
|
u32 n = factory[i].size(), divider=SNICKER;
|
|
if(!n)
|
|
continue;
|
|
int start = 0;
|
|
while(divider) {
|
|
if(n÷r) {
|
|
Chunk subtree = hash_many(factory[i].data(), start, start+divider, memory_bar);
|
|
subtrees.push_back(subtree);
|
|
start += divider;
|
|
}
|
|
divider >>= 1;
|
|
}
|
|
while(subtrees.size()>1) {
|
|
Chunk tmp1 = subtrees.back();
|
|
subtrees.pop_back();
|
|
Chunk tmp2 = subtrees.back();
|
|
subtrees.pop_back();
|
|
// tmp2 is the left child
|
|
// tmp1 is the right child
|
|
// that's the order they appear within the array
|
|
Chunk tmp = merge(tmp2, tmp1);
|
|
subtrees.push_back(tmp);
|
|
}
|
|
if(i<FACTORY_HT-1)
|
|
factory[i+1].push_back(subtrees[0]);
|
|
else
|
|
root = subtrees[0];
|
|
}
|
|
hash_root(root, out_slice);
|
|
}
|
|
|
|
Chunk merge(Chunk &left, Chunk &right) {
|
|
// cout << "Called merge once\n";
|
|
left.compress_chunk();
|
|
right.compress_chunk();
|
|
|
|
Chunk parent(left.flags, left.key);
|
|
parent.flags |= PARENT;
|
|
// 32 bytes need to be copied for all of these
|
|
memcpy(parent.data, left.raw_hash, 32);
|
|
memcpy(parent.data+8, right.raw_hash, 32);
|
|
return parent;
|
|
}
|
|
|
|
void hash_root(Chunk &node, vector<u8> &out_slice) {
|
|
// the last message block must not be hashed like the others
|
|
// it needs to be hashed with the root flag
|
|
u64 output_block_counter = 0;
|
|
u64 i=0, k=2*OUT_LEN;
|
|
|
|
u32 words[16] = {};
|
|
for(; int(out_slice.size()-i)>0; i+=k) {
|
|
node.counter = output_block_counter;
|
|
node.compress_chunk(ROOT);
|
|
|
|
// words is u32[16]
|
|
memcpy(words, node.raw_hash, 16*sizeof(*words));
|
|
|
|
vector<u8> out_block(min(k, (u64)out_slice.size()-i));
|
|
for(u32 l=0; l<out_block.size(); l+=4) {
|
|
for(u32 j=0; j<min(4U, (u32)out_block.size()-l); j++)
|
|
out_block[l+j] = (words[l/4]>>(8*j)) & 0x000000FF;
|
|
}
|
|
|
|
for(u32 j=0; j<out_block.size(); j++)
|
|
out_slice[i+j] = out_block[j];
|
|
|
|
++output_block_counter;
|
|
}
|
|
} |