/* hash.c Aug 2011 * groestl512-hash-4way https://github.com/JayDDee/cpuminer-opt 2019-12. * * Groestl implementation for different versions. * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer * * This code is placed in the public domain */ // Optimized for hash and data length that are integrals of __m128i #include #include "groestl512-intr-4way.h" #include "miner.h" #include "simd-utils.h" #if defined(__AVX2__) && defined(__VAES__) #if defined(SIMD512) int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen ) { memset_zero_512( ctx->chaining, SIZE512 ); memset_zero_512( ctx->buffer, SIZE512 ); // The only non-zero in the IV is len. It can be hard coded. ctx->chaining[ 6 ] = mm512_bcast128hi_64( 0x0200000000000000 ); ctx->buf_ptr = 0; ctx->rem_ptr = 0; return 0; } int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output, const void* input, uint64_t databitlen ) { const int len = (int)databitlen / 128; const int hashlen_m128i = 64 / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; int rem = ctx->rem_ptr; uint64_t blocks = len / SIZE512; __m512i* in = (__m512i*)input; int i; // --- update --- for ( i = 0; i < blocks; i++ ) TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] ); ctx->buf_ptr = blocks * SIZE512; for ( i = 0; i < len % SIZE512; i++ ) ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; i += rem; //--- final --- blocks++; // adjust for final block if ( i == SIZE512 - 1 ) { // only 1 vector left in buffer, all padding at once ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 ); } else { ctx->buffer[i] = mm512_bcast128lo_64( 0x80 ); for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m512_zero; ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 ); } TF1024_4way( ctx->chaining, ctx->buffer ); OF1024_4way( ctx->chaining ); for ( i = 0; i < hashlen_m128i; i++ ) casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ]; return 0; } int groestl512_4way_full( groestl512_4way_context* ctx, void* output, const void* input, uint64_t datalen ) { const int len = (int)datalen >> 4; const int hashlen_m128i = 64 >> 4; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; uint64_t blocks = len / SIZE512; __m512i* in = (__m512i*)input; int i; // --- init --- memset_zero_512( ctx->chaining, SIZE512 ); memset_zero_512( ctx->buffer, SIZE512 ); ctx->chaining[ 6 ] = mm512_bcast128hi_64( 0x0200000000000000 ); ctx->buf_ptr = 0; // --- update --- for ( i = 0; i < blocks; i++ ) TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] ); ctx->buf_ptr = blocks * SIZE512; for ( i = 0; i < len % SIZE512; i++ ) ctx->buffer[ i ] = in[ ctx->buf_ptr + i ]; // --- close --- blocks++; if ( i == SIZE512 - 1 ) { // only 1 vector left in buffer, all padding at once ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 ); } else { ctx->buffer[i] = mm512_bcast128lo_64( 0x80 ); for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m512_zero; ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 ); } TF1024_4way( ctx->chaining, ctx->buffer ); OF1024_4way( ctx->chaining ); for ( i = 0; i < hashlen_m128i; i++ ) casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ]; return 0; } #endif // AVX512 // AVX2 + VAES int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen ) { memset_zero_256( ctx->chaining, SIZE512 ); memset_zero_256( ctx->buffer, SIZE512 ); // The only non-zero in the IV is len. It can be hard coded. ctx->chaining[ 6 ] = mm256_bcast128hi_64( 0x0200000000000000 ); ctx->buf_ptr = 0; ctx->rem_ptr = 0; return 0; } int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output, const void* input, uint64_t databitlen ) { const int len = (int)databitlen / 128; const int hashlen_m128i = 64 / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; int rem = ctx->rem_ptr; uint64_t blocks = len / SIZE512; __m256i* in = (__m256i*)input; int i; // --- update --- for ( i = 0; i < blocks; i++ ) TF1024_2way( ctx->chaining, &in[ i * SIZE512 ] ); ctx->buf_ptr = blocks * SIZE512; for ( i = 0; i < len % SIZE512; i++ ) ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; i += rem; //--- final --- blocks++; // adjust for final block if ( i == SIZE512 - 1 ) { // only 1 vector left in buffer, all padding at once ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 ); } else { ctx->buffer[i] = mm256_bcast128lo_64( 0x80 ); for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m256_zero; ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 ); } TF1024_2way( ctx->chaining, ctx->buffer ); OF1024_2way( ctx->chaining ); for ( i = 0; i < hashlen_m128i; i++ ) casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ]; return 0; } int groestl512_2way_full( groestl512_2way_context* ctx, void* output, const void* input, uint64_t datalen ) { const int len = (int)datalen >> 4; const int hashlen_m128i = 64 >> 4; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; uint64_t blocks = len / SIZE512; __m256i* in = (__m256i*)input; int i; // --- init --- memset_zero_256( ctx->chaining, SIZE512 ); memset_zero_256( ctx->buffer, SIZE512 ); ctx->chaining[ 6 ] = mm256_bcast128hi_64( 0x0200000000000000 ); ctx->buf_ptr = 0; // --- update --- for ( i = 0; i < blocks; i++ ) TF1024_2way( ctx->chaining, &in[ i * SIZE512 ] ); ctx->buf_ptr = blocks * SIZE512; for ( i = 0; i < len % SIZE512; i++ ) ctx->buffer[ i ] = in[ ctx->buf_ptr + i ]; // --- close --- blocks++; if ( i == SIZE512 - 1 ) { // only 1 vector left in buffer, all padding at once ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 ); } else { ctx->buffer[i] = mm256_bcast128lo_64( 0x80 ); for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m256_zero; ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 ); } TF1024_2way( ctx->chaining, ctx->buffer ); OF1024_2way( ctx->chaining ); for ( i = 0; i < hashlen_m128i; i++ ) casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ]; return 0; } #endif // VAES