#if defined(__AVX2__)

// sha3-4way.c
// 19-Nov-11  Markku-Juhani O. Saarinen
// vectorization by JayDDee 2021-03-27
//
// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3"
// Revised 03-Sep-15 for portability + OpenSSL - style API
//
// Every state word below is a SIMD vector, so a single call processes one
// 64-bit Keccak word for several hashes at once (4 with __m256i, 8 with
// __m512i). Input and output buffers are accessed directly as arrays of
// those vectors.

#include "sha3-4way.h"

#include <string.h>   // memcpy

// Keccak-f round constants, one per round, XORed into state word 0 (Iota).
// NOTE(review): KECCAKF_ROUNDS is defined outside this file; it must not
// exceed the 24 entries available here.
static const uint64_t keccakf_rndc[24] =
{
   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
   0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
   0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
   0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
   0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
   0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
   0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};

// Apply KECCAKF_ROUNDS rounds of the Keccak-f permutation, in place, to a
// 25-word state whose words are 256-bit vectors.
//
// st: 25 state vectors, overwritten with the permuted state.
//
// mm256_xor3, mm256_rol_64 and mm256_xorandnot come from the project
// headers; presumably 3-input XOR, per-64-bit-lane rotate left, and
// a ^ (~b & c) respectively (the AVX-512 variant below spells the same
// steps with plain intrinsics) -- confirm against their definitions.
void sha3_4way_keccakf( __m256i st[25] )
{
   int i, j, r;
   __m256i t, bc[5];

   for ( r = 0; r < KECCAKF_ROUNDS; r++ )
   {
      // Theta: column parities...
      bc[0] = mm256_xor3( st[0], st[5],
                          mm256_xor3( st[10], st[15], st[20] ) );
      bc[1] = mm256_xor3( st[1], st[6],
                          mm256_xor3( st[11], st[16], st[21] ) );
      bc[2] = mm256_xor3( st[2], st[7],
                          mm256_xor3( st[12], st[17], st[22] ) );
      bc[3] = mm256_xor3( st[3], st[8],
                          mm256_xor3( st[13], st[18], st[23] ) );
      bc[4] = mm256_xor3( st[4], st[9],
                          mm256_xor3( st[14], st[19], st[24] ) );

      // ...then mix the two neighbouring column parities into column i.
      for ( i = 0; i < 5; i++ )
      {
         t = _mm256_xor_si256( bc[ (i+4) % 5 ],
                               mm256_rol_64( bc[ (i+1) % 5 ], 1 ) );
         st[ i    ] = _mm256_xor_si256( st[ i    ], t );
         st[ i+5  ] = _mm256_xor_si256( st[ i+5  ], t );
         st[ i+10 ] = _mm256_xor_si256( st[ i+10 ], t );
         st[ i+15 ] = _mm256_xor_si256( st[ i+15 ], t );
         st[ i+20 ] = _mm256_xor_si256( st[ i+20 ], t );
      }

      // Rho Pi: walk a fixed chain of positions. Each step saves the word
      // at position i, overwrites it with the carried word t rotated left
      // by c bits, then carries the saved word forward. bc[0] is scratch.
      // Wrapped in do/while so the macro behaves as a single statement.
#define RHO_PI( i, c ) \
      do \
      { \
         bc[0] = st[ i ]; \
         st[ i ] = mm256_rol_64( t, c ); \
         t = bc[0]; \
      } while ( 0 )

      t = st[1];

      RHO_PI( 10,  1 );
      RHO_PI(  7,  3 );
      RHO_PI( 11,  6 );
      RHO_PI( 17, 10 );
      RHO_PI( 18, 15 );
      RHO_PI(  3, 21 );
      RHO_PI(  5, 28 );
      RHO_PI( 16, 36 );
      RHO_PI(  8, 45 );
      RHO_PI( 21, 55 );
      RHO_PI( 24,  2 );
      RHO_PI(  4, 14 );
      RHO_PI( 15, 27 );
      RHO_PI( 23, 41 );
      RHO_PI( 19, 56 );
      RHO_PI( 13,  8 );
      RHO_PI( 12, 25 );
      RHO_PI(  2, 43 );
      RHO_PI( 20, 62 );
      RHO_PI( 14, 18 );
      RHO_PI( 22, 39 );
      RHO_PI(  9, 61 );
      RHO_PI(  6, 20 );
      RHO_PI(  1, 44 );

#undef RHO_PI

      // Chi: combine each word with the next two words of its row of 5.
      // Only the first two words must be saved: they are overwritten
      // before the last two words of the row read them.
      for ( j = 0; j < 25; j += 5 )
      {
         bc[0] = st[j];
         bc[1] = st[j+1];
         st[ j   ] = mm256_xorandnot( st[ j   ], st[j+1], st[j+2] );
         st[ j+1 ] = mm256_xorandnot( st[ j+1 ], st[j+2], st[j+3] );
         st[ j+2 ] = mm256_xorandnot( st[ j+2 ], st[j+3], st[j+4] );
         st[ j+3 ] = mm256_xorandnot( st[ j+3 ], st[j+4], bc[0] );
         st[ j+4 ] = mm256_xorandnot( st[ j+4 ], bc[0], bc[1] );
      }

      // Iota: same round constant broadcast to every 64-bit lane.
      st[0] = _mm256_xor_si256( st[0],
                                _mm256_set1_epi64x( keccakf_rndc[ r ] ) );
   }
}

// Initialize a 4-way context: zero the 25 state vectors, record mdlen and
// derive the rate rsiz = 200 - 2 * mdlen.
//
// mdlen is not validated. sha3_4way_update and sha3_4way_final index the
// 25-entry state with rsiz / 8, so the caller must choose mdlen such that
// 1 <= rsiz / 8 <= 25.
//
// Always returns 1.
int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen )
{
   for ( int i = 0; i < 25; i++ )
      c->st[ i ] = m256_zero;
   c->mdlen = mdlen;
   c->rsiz = 200 - 2 * mdlen;
   c->pt = 0;
   return 1;
}

// Absorb more input into the state.
//
// data: read as an array of __m256i; one vector (32 bytes) is consumed for
//       every 8 bytes counted by len. The pointer is dereferenced as
//       __m256i directly, so it presumably must be 32-byte aligned.
// len:  length counted in units where 8 == one vector; only whole groups
//       of 8 are processed, any remainder (len % 8) is ignored.
//
// The state is permuted each time rsiz / 8 vectors have been absorbed.
// Always returns 1.
int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len )
{
   const __m256i *vdata = (const __m256i*)data;
   const int rsiz = c->rsiz / 8;
   // Keep the word count in size_t: narrowing it to int would truncate or
   // go negative for very large len.
   const size_t nwords = len / 8;
   int j = c->pt;

   for ( size_t i = 0; i < nwords; i++ )
   {
      c->st[ j ] = _mm256_xor_si256( c->st[ j ], vdata[i] );
      j++;
      if ( j >= rsiz )
      {
         sha3_4way_keccakf( c->st );
         j = 0;
      }
   }
   c->pt = j;

   return 1;
}

// Finalize and write the digests.
//
// XORs the value 6 into the word at the current absorb position and the
// top bit (0x8000000000000000) into the last word of the rate, in every
// lane, then permutes once more. Because the padding is applied at whole
// word granularity, this is only correct when the absorbed input was a
// whole number of 64-bit words, which is all sha3_4way_update accepts.
//
// md: receives the first mdlen * 4 bytes of the state array, i.e. the
//     digests are left in the same per-word vector layout as the state.
//
// Always returns 1.
int sha3_4way_final( void *md, sha3_4way_ctx_t *c )
{
   c->st[ c->pt ] = _mm256_xor_si256( c->st[ c->pt ],
                                      m256_const1_64( 6 ) );
   c->st[ c->rsiz / 8 - 1 ] =
            _mm256_xor_si256( c->st[ c->rsiz / 8 - 1 ],
                              m256_const1_64( 0x8000000000000000 ) );
   sha3_4way_keccakf( c->st );
   memcpy( md, c->st, c->mdlen * 4 );
   return 1;
}

// One-shot helper: init, update with (in, inlen), final into md.
// Same buffer layout and length rules as the three calls it wraps.
// Returns md.
void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen )
{
   sha3_4way_ctx_t ctx;
   sha3_4way_init( &ctx, mdlen);
   sha3_4way_update( &ctx, in, inlen );
   sha3_4way_final( md, &ctx );
   return md;
}

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// Apply KECCAKF_ROUNDS rounds of the Keccak-f permutation, in place, to a
// 25-word state whose words are 512-bit vectors.
//
// st: 25 state vectors, overwritten with the permuted state.
//
// mm512_xor4 comes from the project headers; presumably a 4-input XOR --
// confirm against its definition.
void sha3_8way_keccakf( __m512i st[25] )
{
   int i, j, r;
   __m512i t, bc[5];

   // actual iteration
   for ( r = 0; r < KECCAKF_ROUNDS; r++ )
   {
      // Theta: column parities...
      for ( i = 0; i < 5; i++ )
         bc[i] = _mm512_xor_si512( st[i],
                    mm512_xor4( st[ i+5 ], st[ i+10 ], st[ i+15 ],
                                st[ i+20 ] ) );

      // ...then mix the two neighbouring column parities into column i.
      for ( i = 0; i < 5; i++ )
      {
         t = _mm512_xor_si512( bc[(i + 4) % 5],
                               _mm512_rol_epi64( bc[(i + 1) % 5], 1 ) );
         for ( j = 0; j < 25; j += 5 )
            st[j + i] = _mm512_xor_si512( st[j + i], t );
      }

      // Rho Pi: walk a fixed chain of positions. Each step saves the word
      // at position i, overwrites it with the carried word t rotated left
      // by c bits, then carries the saved word forward. bc[0] is scratch.
      // Wrapped in do/while so the macro behaves as a single statement.
#define RHO_PI( i, c ) \
      do \
      { \
         bc[0] = st[ i ]; \
         st[ i ] = _mm512_rol_epi64( t, c ); \
         t = bc[0]; \
      } while ( 0 )

      t = st[1];

      RHO_PI( 10,  1 );
      RHO_PI(  7,  3 );
      RHO_PI( 11,  6 );
      RHO_PI( 17, 10 );
      RHO_PI( 18, 15 );
      RHO_PI(  3, 21 );
      RHO_PI(  5, 28 );
      RHO_PI( 16, 36 );
      RHO_PI(  8, 45 );
      RHO_PI( 21, 55 );
      RHO_PI( 24,  2 );
      RHO_PI(  4, 14 );
      RHO_PI( 15, 27 );
      RHO_PI( 23, 41 );
      RHO_PI( 19, 56 );
      RHO_PI( 13,  8 );
      RHO_PI( 12, 25 );
      RHO_PI(  2, 43 );
      RHO_PI( 20, 62 );
      RHO_PI( 14, 18 );
      RHO_PI( 22, 39 );
      RHO_PI(  9, 61 );
      RHO_PI(  6, 20 );
      RHO_PI(  1, 44 );

#undef RHO_PI

      // Chi: st[j+i] ^= ~row[i+1] & row[i+2], using a saved copy of the
      // row so every output is computed from the pre-Chi values.
      for ( j = 0; j < 25; j += 5 )
      {
         for ( i = 0; i < 5; i++ )
            bc[i] = st[j + i];
         for ( i = 0; i < 5; i++ )
            st[ j+i ] = _mm512_xor_si512( st[ j+i ],
                           _mm512_andnot_si512( bc[ (i+1) % 5 ],
                                                bc[ (i+2) % 5 ] ) );
      }

      // Iota: same round constant broadcast to every 64-bit lane.
      st[0] = _mm512_xor_si512( st[0],
                                _mm512_set1_epi64( keccakf_rndc[r] ) );
   }
}

// Initialize the context for SHA3: zero the 25 state vectors, record mdlen
// and derive the rate rsiz = 200 - 2 * mdlen.
//
// mdlen is not validated. sha3_8way_update and sha3_8way_final index the
// 25-entry state with rsiz / 8, so the caller must choose mdlen such that
// 1 <= rsiz / 8 <= 25.
//
// Always returns 1.
int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen )
{
   for ( int i = 0; i < 25; i++ )
      c->st[ i ] = m512_zero;
   c->mdlen = mdlen;
   c->rsiz = 200 - 2 * mdlen;
   c->pt = 0;
   return 1;
}

// Update state with more data.
//
// data: read as an array of __m512i; one vector (64 bytes) is consumed for
//       every 8 bytes counted by len. The pointer is dereferenced as
//       __m512i directly, so it presumably must be 64-byte aligned.
// len:  length counted in units where 8 == one vector; only whole groups
//       of 8 are processed, any remainder (len % 8) is ignored.
//
// The state is permuted each time rsiz / 8 vectors have been absorbed.
// Always returns 1.
int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len )
{
   const __m512i *vdata = (const __m512i*)data;
   const int rsiz = c->rsiz / 8;
   // Keep the word count in size_t: narrowing it to int would truncate or
   // go negative for very large len.
   const size_t nwords = len / 8;
   int j = c->pt;

   for ( size_t i = 0; i < nwords; i++ )
   {
      c->st[ j ] = _mm512_xor_si512( c->st[ j ], vdata[i] );
      j++;
      if ( j >= rsiz )
      {
         sha3_8way_keccakf( c->st );
         j = 0;
      }
   }
   c->pt = j;

   return 1;
}

// Finalize and output a hash.
//
// XORs the value 6 into the word at the current absorb position and the
// top bit (0x8000000000000000) into the last word of the rate, in every
// lane, then permutes once more. Because the padding is applied at whole
// word granularity, this is only correct when the absorbed input was a
// whole number of 64-bit words, which is all sha3_8way_update accepts.
//
// md: receives the first mdlen * 8 bytes of the state array, i.e. the
//     digests are left in the same per-word vector layout as the state.
//
// Always returns 1.
int sha3_8way_final( void *md, sha3_8way_ctx_t *c )
{
   c->st[ c->pt ] = _mm512_xor_si512( c->st[ c->pt ],
                                      m512_const1_64( 6 ) );
   c->st[ c->rsiz / 8 - 1 ] =
            _mm512_xor_si512( c->st[ c->rsiz / 8 - 1 ],
                              m512_const1_64( 0x8000000000000000 ) );
   sha3_8way_keccakf( c->st );
   memcpy( md, c->st, c->mdlen * 8 );
   return 1;
}

// Compute a SHA-3 hash (md) of given byte length from "in".
// One-shot helper: init, update with (in, inlen), final into md.
// Same buffer layout and length rules as the three calls it wraps.
// Returns md.
void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen )
{
   sha3_8way_ctx_t sha3;
   sha3_8way_init( &sha3, mdlen);
   sha3_8way_update( &sha3, in, inlen );
   sha3_8way_final( md, &sha3 );
   return md;
}

#endif // AVX512
#endif // AVX2