diff --git a/README.txt b/README.txt
index edaff7d..5d50a87 100644
--- a/README.txt
+++ b/README.txt
@@ -1,6 +1,10 @@
 This file is included in the Windows binary package. Compile instructions
 for Linux and Windows can be found in RELEASE_NOTES.
 
+This package is officially available only from:
+   https://github.com/JayDDee/cpuminer-opt
+No other sources should be trusted.
+
 cpuminer is a console program that is executed from a DOS or PowerShell
 prompt. There is no GUI and no mouse support.
 
@@ -31,20 +35,22 @@ https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
 https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
 
-Exe file name                 Compile flags              Arch name
+Exe file name                 Compile flags              Arch name
 
 cpuminer-sse2.exe             "-msse2"                   Core2, Nehalem
-cpuminer-aes-sse42.exe        "-march=westmere"          Westmere
+cpuminer-aes-sse42.exe        "-march=westmere"          Westmere
 cpuminer-avx.exe              "-march=corei7-avx"        Sandybridge, Ivybridge
-cpuminer-avx2.exe             "-march=core-avx2 -maes"   Haswell*
+cpuminer-avx2.exe             "-march=core-avx2 -maes"   Haswell(1)
 cpuminer-avx512.exe           "-march=skylake-avx512"    Skylake-X, Cascadelake-X
-cpuminer-zen.exe              "-march=znver1"            AMD Ryzen, Threadripper
-cpuminer-avx512-sha-vaes.exe  "-march=icelake-client"    Icelake*
+cpuminer-zen.exe              "-march=znver1"            Zen1, Zen2
+cpuminer-zen3.exe             "-march=znver2 -mvaes"     Zen3(2)
+cpuminer-avx512-sha-vaes.exe  "-march=icelake-client"    Icelake(3)
 
-* Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
-Icelake is only available on some laptops. Mining with a laptop is not
-recommended. The icelake build is included in anticipation of Intel eventually
-releasing a desktop CPU with a microarchitecture newer than Skylake.
+(1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
+(2) The Zen3 build uses Zen2+VAES as a workaround until Zen3 compiler
+    support is available. Zen2 CPUs should use the Zen build.
+(3) Icelake is only available on some laptops. Mining with a laptop is not
+recommended.
 
 Notes about included DLL files:

diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 1c4d406..106d499 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.15.2
+
+Zen3 AVX2+VAES optimization for x16*, x17, sonoa, xevan, x21s, x22i, x25x,
+allium.
+Zen3 build added to Windows binary package.
+
 v3.15.1
 
 Fix compile on AMD Zen3 CPUs with VAES.

diff --git a/algo-gate-api.h b/algo-gate-api.h
index 80aa3b4..af29ecb 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -90,10 +90,11 @@ typedef uint32_t set_t;
 #define AES_OPT       2
 #define SSE42_OPT     4
 #define AVX_OPT       8     // Sandybridge
-#define AVX2_OPT    0x10    // Haswell
-#define SHA_OPT     0x20    // sha256 (Ryzen, Ice Lake)
-#define AVX512_OPT  0x40    // AVX512- F, VL, DQ, BW (Skylake-X)
-#define VAES_OPT    0x80    // VAES (Ice Lake)
+#define AVX2_OPT     0x10   // Haswell, Zen1
+#define SHA_OPT      0x20   // Zen1, Icelake (sha256)
+#define AVX512_OPT   0x40   // Skylake-X (AVX512[F,VL,DQ,BW])
+#define VAES_OPT     0x80   // Icelake (VAES & AVX512)
+#define VAES256_OPT  0x100  // Zen3 (VAES without AVX512)
 
 // return set containing all elements from sets a & b
 
@@ -111,9 +112,9 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
 typedef struct
 {
 // Mandatory functions, one of these is mandatory. If a generic scanhash
-// is used a custom hash function must be registered, with a custom scanhash
-// the custom hash function can be called directly and doesn't need to be
-// registered in the gate.
+// is used a custom target hash function must be registered, with a custom +// scanhash the target hash function can be called directly and doesn't need +// to be registered in the gate. int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* ); int ( *hash ) ( void*, const void*, int ); diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index 57c0a94..eb3c41c 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -1,5 +1,4 @@ -//#if 0 -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) #include "simd-utils.h" #include "echo-hash-4way.h" @@ -13,8 +12,12 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = */ // do these need to be reversed? +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + #define mul2mask \ - _mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) + m512_const2_64( 0, 0x00001b00 ) +//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) // _mm512_set4_epi32( 0x00001b00, 0, 0, 0 ) #define lsbmask m512_const1_32( 0x01010101 ) @@ -30,87 +33,87 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = const int j2 = ( (j)+2 ) & 3; \ const int j3 = ( (j)+3 ) & 3; \ s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \ - t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \ - t1 = _mm512_and_si512( t1, lsbmask );\ - t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ - s2 = _mm512_xor_si512( s2, t2 ); \ - state2[ 0 ] [j ] = s2; \ - state2[ 1 ] [j ] = state1[ 0 ][ j ]; \ - state2[ 2 ] [j ] = state1[ 0 ][ j ]; \ - state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\ - s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \ - t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \ - t1 = _mm512_and_si512( t1, lsbmask ); \ - t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ - s2 = _mm512_xor_si512( s2, t2 );\ - state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \ - _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \ - state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \ - state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \ - state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \ - s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \ - t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \ - t1 = _mm512_and_si512( t1, lsbmask ); \ - t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ - s2 = _mm512_xor_si512( s2, t2 ); \ - state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ - state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \ + t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \ + t1 = _mm512_and_si512( t1, lsbmask );\ + t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ + s2 = _mm512_xor_si512( s2, t2 ); \ + state2[ 0 ] [j ] = s2; \ + state2[ 1 ] [j ] = state1[ 0 ][ j ]; \ + state2[ 2 ] [j ] = state1[ 0 ][ j ]; \ + state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\ + s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \ + t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \ + t1 = _mm512_and_si512( t1, lsbmask ); \ + t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ + s2 = _mm512_xor_si512( s2, t2 );\ + state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \ + _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \ + state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \ + state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \ + state2[ 3 ][ j ] = _mm512_xor_si512( 
state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \ + s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \ + t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \ + t1 = _mm512_and_si512( t1, lsbmask ); \ + t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ + s2 = _mm512_xor_si512( s2, t2 ); \ + state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ + state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \ _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \ - state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \ - state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ - s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ - t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \ - t1 = _mm512_and_si512( t1, lsbmask ); \ - t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ - s2 = _mm512_xor_si512( s2, t2 ); \ - state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \ - state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ - state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \ + state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \ + state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ + s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ + t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \ + t1 = _mm512_and_si512( t1, lsbmask ); \ + t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ + s2 = _mm512_xor_si512( s2, t2 ); \ + state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \ + state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ + state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \ _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \ - state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \ + state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \ } while(0) #define ECHO_ROUND_UNROLL2 \ - ECHO_SUBBYTES(_state, 0, 0);\ + ECHO_SUBBYTES(_state, 0, 0);\ ECHO_SUBBYTES(_state, 1, 0);\ - ECHO_SUBBYTES(_state, 2, 0);\ - ECHO_SUBBYTES(_state, 3, 0);\ - ECHO_SUBBYTES(_state, 0, 1);\ - ECHO_SUBBYTES(_state, 1, 1);\ - ECHO_SUBBYTES(_state, 2, 1);\ - ECHO_SUBBYTES(_state, 3, 1);\ - ECHO_SUBBYTES(_state, 0, 2);\ - ECHO_SUBBYTES(_state, 1, 2);\ - ECHO_SUBBYTES(_state, 2, 2);\ - ECHO_SUBBYTES(_state, 3, 2);\ - ECHO_SUBBYTES(_state, 0, 3);\ - ECHO_SUBBYTES(_state, 1, 3);\ - ECHO_SUBBYTES(_state, 2, 3);\ - ECHO_SUBBYTES(_state, 3, 3);\ - ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ - ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ - ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ - ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ - ECHO_SUBBYTES(_state2, 0, 0);\ - ECHO_SUBBYTES(_state2, 1, 0);\ - ECHO_SUBBYTES(_state2, 2, 0);\ - ECHO_SUBBYTES(_state2, 3, 0);\ - ECHO_SUBBYTES(_state2, 0, 1);\ - ECHO_SUBBYTES(_state2, 1, 1);\ - ECHO_SUBBYTES(_state2, 2, 1);\ - ECHO_SUBBYTES(_state2, 3, 1);\ - ECHO_SUBBYTES(_state2, 0, 2);\ - ECHO_SUBBYTES(_state2, 1, 2);\ - ECHO_SUBBYTES(_state2, 2, 2);\ - ECHO_SUBBYTES(_state2, 3, 2);\ - ECHO_SUBBYTES(_state2, 0, 3);\ - ECHO_SUBBYTES(_state2, 1, 3);\ - ECHO_SUBBYTES(_state2, 2, 3);\ - ECHO_SUBBYTES(_state2, 3, 3);\ - ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ - ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ - ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ - ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) + ECHO_SUBBYTES(_state, 2, 0);\ + ECHO_SUBBYTES(_state, 3, 0);\ + ECHO_SUBBYTES(_state, 0, 1);\ + ECHO_SUBBYTES(_state, 1, 1);\ + ECHO_SUBBYTES(_state, 2, 1);\ + ECHO_SUBBYTES(_state, 3, 
1);\ + ECHO_SUBBYTES(_state, 0, 2);\ + ECHO_SUBBYTES(_state, 1, 2);\ + ECHO_SUBBYTES(_state, 2, 2);\ + ECHO_SUBBYTES(_state, 3, 2);\ + ECHO_SUBBYTES(_state, 0, 3);\ + ECHO_SUBBYTES(_state, 1, 3);\ + ECHO_SUBBYTES(_state, 2, 3);\ + ECHO_SUBBYTES(_state, 3, 3);\ + ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES(_state2, 0, 0);\ + ECHO_SUBBYTES(_state2, 1, 0);\ + ECHO_SUBBYTES(_state2, 2, 0);\ + ECHO_SUBBYTES(_state2, 3, 0);\ + ECHO_SUBBYTES(_state2, 0, 1);\ + ECHO_SUBBYTES(_state2, 1, 1);\ + ECHO_SUBBYTES(_state2, 2, 1);\ + ECHO_SUBBYTES(_state2, 3, 1);\ + ECHO_SUBBYTES(_state2, 0, 2);\ + ECHO_SUBBYTES(_state2, 1, 2);\ + ECHO_SUBBYTES(_state2, 2, 2);\ + ECHO_SUBBYTES(_state2, 3, 2);\ + ECHO_SUBBYTES(_state2, 0, 3);\ + ECHO_SUBBYTES(_state2, 1, 3);\ + ECHO_SUBBYTES(_state2, 2, 3);\ + ECHO_SUBBYTES(_state2, 3, 3);\ + ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) #define SAVESTATE(dst, src)\ dst[0][0] = src[0][0];\ @@ -224,43 +227,43 @@ void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg, int echo_4way_init( echo_4way_context *ctx, int nHashSize ) { - int i, j; + int i, j; ctx->k = m512_zero; - ctx->processed_bits = 0; - ctx->uBufferBytes = 0; + ctx->processed_bits = 0; + ctx->uBufferBytes = 0; - switch( nHashSize ) - { - case 256: - ctx->uHashSize = 256; - ctx->uBlockLength = 192; - ctx->uRounds = 8; - ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 ); - ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 ); - break; + switch( nHashSize ) + { + case 256: + ctx->uHashSize = 256; + ctx->uBlockLength = 192; + ctx->uRounds = 8; + ctx->hashsize = m512_const2_64( 0, 0x100 ); + ctx->const1536 = m512_const2_64( 0, 0x600 ); + break; - case 512: - ctx->uHashSize = 512; - ctx->uBlockLength = 128; - ctx->uRounds = 10; - ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 ); - ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400); - break; + case 512: + ctx->uHashSize = 512; + ctx->uBlockLength = 128; + ctx->uRounds = 10; + ctx->hashsize = m512_const2_64( 0, 0x200 ); + ctx->const1536 = m512_const2_64( 0, 0x400); + break; - default: - return 1; - } + default: + return 1; + } - for( i = 0; i < 4; i++ ) - for( j = 0; j < nHashSize / 256; j++ ) - ctx->state[ i ][ j ] = ctx->hashsize; + for( i = 0; i < 4; i++ ) + for( j = 0; j < nHashSize / 256; j++ ) + ctx->state[ i ][ j ] = ctx->hashsize; - for( i = 0; i < 4; i++ ) - for( j = nHashSize / 256; j < 4; j++ ) - ctx->state[ i ][ j ] = m512_zero; + for( i = 0; i < 4; i++ ) + for( j = nHashSize / 256; j < 4; j++ ) + ctx->state[ i ][ j ] = m512_zero; - return 0; + return 0; } int echo_4way_update_close( echo_4way_context *state, void *hashval, @@ -285,17 +288,13 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval, vlen = databitlen / 128; // * 4 lanes / 128 bits per lane memcpy_512( state->buffer, data, vlen ); state->processed_bits += (unsigned int)( databitlen ); - remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen ); - + remainingbits = m512_const2_64( 0, (uint64_t)databitlen ); } - state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); + state->buffer[ vlen ] = m512_const2_64( 0, 0x80 ); memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 ); - state->buffer[ vblen-2 ] = - _mm512_set4_epi32( 
(uint32_t)state->uHashSize << 16, 0, 0, 0 ); - state->buffer[ vblen-1 ] = - _mm512_set4_epi64( 0, state->processed_bits, - 0, state->processed_bits ); + state->buffer[ vblen-2 ] = m512_const2_64( (uint64_t)state->uHashSize << 48, 0 ); + state->buffer[ vblen-1 ] = m512_const2_64( 0, state->processed_bits); state->k = _mm512_add_epi64( state->k, remainingbits ); state->k = _mm512_sub_epi64( state->k, state->const1536 ); @@ -328,16 +327,16 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, ctx->uHashSize = 256; ctx->uBlockLength = 192; ctx->uRounds = 8; - ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 ); - ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 ); + ctx->hashsize = m512_const2_64( 0, 0x100 ); + ctx->const1536 = m512_const2_64( 0, 0x600 ); break; case 512: ctx->uHashSize = 512; ctx->uBlockLength = 128; ctx->uRounds = 10; - ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 ); - ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400); + ctx->hashsize = m512_const2_64( 0, 0x200 ); + ctx->const1536 = m512_const2_64( 0, 0x400 ); break; default: @@ -372,17 +371,14 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, vlen = databitlen / 128; // * 4 lanes / 128 bits per lane memcpy_512( ctx->buffer, data, vlen ); ctx->processed_bits += (unsigned int)( databitlen ); - remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen ); - + remainingbits = m512_const2_64( 0, databitlen ); } - ctx->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); + ctx->buffer[ vlen ] = m512_const2_64( 0, 0x80 ); memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 ); ctx->buffer[ vblen-2 ] = - _mm512_set4_epi32( (uint32_t)ctx->uHashSize << 16, 0, 0, 0 ); - ctx->buffer[ vblen-1 ] = - _mm512_set4_epi64( 0, ctx->processed_bits, - 0, ctx->processed_bits ); + m512_const2_64( (uint64_t)ctx->uHashSize << 48, 0 ); + ctx->buffer[ vblen-1 ] = m512_const2_64( 0, ctx->processed_bits); ctx->k = _mm512_add_epi64( ctx->k, remainingbits ); ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 ); @@ -400,5 +396,380 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, return 0; } +#endif // AVX512 -#endif +// AVX2 + VAES + +#define mul2mask_2way m256_const2_64( 0, 0x0000000000001b00 ) + +#define lsbmask_2way m256_const1_32( 0x01010101 ) + +#define ECHO_SUBBYTES_2WAY( state, i, j ) \ + state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \ + state[i][j] = _mm256_aesenc_epi128( state[i][j], m256_zero ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); + +#define ECHO_MIXBYTES_2WAY( state1, state2, j, t1, t2, s2 ) do \ +{ \ + const int j1 = ( (j)+1 ) & 3; \ + const int j2 = ( (j)+2 ) & 3; \ + const int j3 = ( (j)+3 ) & 3; \ + s2 = _mm256_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \ + t1 = _mm256_srli_epi16( state1[ 0 ][ j ], 7 ); \ + t1 = _mm256_and_si256( t1, lsbmask_2way );\ + t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \ + s2 = _mm256_xor_si256( s2, t2 ); \ + state2[ 0 ] [j ] = s2; \ + state2[ 1 ] [j ] = state1[ 0 ][ j ]; \ + state2[ 2 ] [j ] = state1[ 0 ][ j ]; \ + state2[ 3 ] [j ] = _mm256_xor_si256( s2, state1[ 0 ][ j ] );\ + s2 = _mm256_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \ + t1 = _mm256_srli_epi16( state1[ 1 ][ j1 ], 7 ); \ + t1 = _mm256_and_si256( t1, lsbmask_2way ); \ + t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \ + s2 = _mm256_xor_si256( s2, t2 );\ + state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], \ + _mm256_xor_si256( s2, state1[ 1 ][ j1 ] ) ); \ + state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], s2 ); \ + 
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \ + state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \ + s2 = _mm256_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \ + t1 = _mm256_srli_epi16( state1[ 2 ][ j2 ], 7 ); \ + t1 = _mm256_and_si256( t1, lsbmask_2way ); \ + t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \ + s2 = _mm256_xor_si256( s2, t2 ); \ + state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ + state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], \ + _mm256_xor_si256( s2, state1[ 2 ][ j2 ] ) ); \ + state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], s2 ); \ + state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ + s2 = _mm256_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ + t1 = _mm256_srli_epi16( state1[ 3 ][ j3 ], 7 ); \ + t1 = _mm256_and_si256( t1, lsbmask_2way ); \ + t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \ + s2 = _mm256_xor_si256( s2, t2 ); \ + state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \ + state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ + state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], \ + _mm256_xor_si256( s2, state1[ 3 ][ j3] ) ); \ + state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \ +} while(0) + +#define ECHO_ROUND_UNROLL2_2WAY \ + ECHO_SUBBYTES_2WAY(_state, 0, 0);\ + ECHO_SUBBYTES_2WAY(_state, 1, 0);\ + ECHO_SUBBYTES_2WAY(_state, 2, 0);\ + ECHO_SUBBYTES_2WAY(_state, 3, 0);\ + ECHO_SUBBYTES_2WAY(_state, 0, 1);\ + ECHO_SUBBYTES_2WAY(_state, 1, 1);\ + ECHO_SUBBYTES_2WAY(_state, 2, 1);\ + ECHO_SUBBYTES_2WAY(_state, 3, 1);\ + ECHO_SUBBYTES_2WAY(_state, 0, 2);\ + ECHO_SUBBYTES_2WAY(_state, 1, 2);\ + ECHO_SUBBYTES_2WAY(_state, 2, 2);\ + ECHO_SUBBYTES_2WAY(_state, 3, 2);\ + ECHO_SUBBYTES_2WAY(_state, 0, 3);\ + ECHO_SUBBYTES_2WAY(_state, 1, 3);\ + ECHO_SUBBYTES_2WAY(_state, 2, 3);\ + ECHO_SUBBYTES_2WAY(_state, 3, 3);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES_2WAY(_state2, 0, 0);\ + ECHO_SUBBYTES_2WAY(_state2, 1, 0);\ + ECHO_SUBBYTES_2WAY(_state2, 2, 0);\ + ECHO_SUBBYTES_2WAY(_state2, 3, 0);\ + ECHO_SUBBYTES_2WAY(_state2, 0, 1);\ + ECHO_SUBBYTES_2WAY(_state2, 1, 1);\ + ECHO_SUBBYTES_2WAY(_state2, 2, 1);\ + ECHO_SUBBYTES_2WAY(_state2, 3, 1);\ + ECHO_SUBBYTES_2WAY(_state2, 0, 2);\ + ECHO_SUBBYTES_2WAY(_state2, 1, 2);\ + ECHO_SUBBYTES_2WAY(_state2, 2, 2);\ + ECHO_SUBBYTES_2WAY(_state2, 3, 2);\ + ECHO_SUBBYTES_2WAY(_state2, 0, 3);\ + ECHO_SUBBYTES_2WAY(_state2, 1, 3);\ + ECHO_SUBBYTES_2WAY(_state2, 2, 3);\ + ECHO_SUBBYTES_2WAY(_state2, 3, 3);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2) + +#define SAVESTATE_2WAY(dst, src)\ + dst[0][0] = src[0][0];\ + dst[0][1] = src[0][1];\ + dst[0][2] = src[0][2];\ + dst[0][3] = src[0][3];\ + dst[1][0] = src[1][0];\ + dst[1][1] = src[1][1];\ + dst[1][2] = src[1][2];\ + dst[1][3] = src[1][3];\ + dst[2][0] = src[2][0];\ + dst[2][1] = src[2][1];\ + dst[2][2] = src[2][2];\ + dst[2][3] = src[2][3];\ + dst[3][0] = src[3][0];\ + dst[3][1] = src[3][1];\ + dst[3][2] = src[3][2];\ + dst[3][3] = src[3][3] + +// blockcount always 1 +void echo_2way_compress( echo_2way_context *ctx, const 
__m256i *pmsg, + unsigned int uBlockCount ) +{ + unsigned int r, b, i, j; + __m256i t1, t2, s2, k1; + __m256i _state[4][4], _state2[4][4], _statebackup[4][4]; + + _state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ]; + _state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ]; + _state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ]; + _state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ]; + _state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ]; + _state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ]; + _state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ]; + _state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ]; + _state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ]; + _state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ]; + _state[ 2 ][ 2 ] = ctx->state[ 2 ][ 2 ]; + _state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ]; + _state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ]; + _state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ]; + _state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ]; + _state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ]; + + for ( b = 0; b < uBlockCount; b++ ) + { + ctx->k = _mm256_add_epi64( ctx->k, ctx->const1536 ); + + for( j = ctx->uHashSize / 256; j < 4; j++ ) + { + for ( i = 0; i < 4; i++ ) + { + _state[ i ][ j ] = _mm256_load_si256( + pmsg + 4 * (j - (ctx->uHashSize / 256)) + i ); + } + } + + // save state + SAVESTATE_2WAY( _statebackup, _state ); + + k1 = ctx->k; + + for ( r = 0; r < ctx->uRounds / 2; r++ ) + { + ECHO_ROUND_UNROLL2_2WAY; + } + + if ( ctx->uHashSize == 256 ) + { + for ( i = 0; i < 4; i++ ) + { + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _state[ i ][ 1 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _state[ i ][ 2 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _state[ i ][ 3 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _statebackup[ i ][ 0 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _statebackup[ i ][ 1 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _statebackup[ i ][ 2 ] ) ; + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _statebackup[ i ][ 3 ] ); + } + } + else + { + for ( i = 0; i < 4; i++ ) + { + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _state[ i ][ 2 ] ); + _state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ], + _state[ i ][ 3 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _statebackup[ i ][ 0 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ] [0 ], + _statebackup[ i ][ 2 ] ); + _state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ], + _statebackup[ i ][ 1 ] ); + _state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ], + _statebackup[ i ][ 3 ] ); + } + } + pmsg += ctx->uBlockLength; + } + SAVESTATE_2WAY(ctx->state, _state); + +} +int echo_2way_init( echo_2way_context *ctx, int nHashSize ) +{ + int i, j; + + ctx->k = m256_zero; + ctx->processed_bits = 0; + ctx->uBufferBytes = 0; + + switch( nHashSize ) + { + case 256: + ctx->uHashSize = 256; + ctx->uBlockLength = 192; + ctx->uRounds = 8; + ctx->hashsize = m256_const2_64( 0, 0x100 ); + ctx->const1536 = m256_const2_64( 0, 0x600 ); + break; + + case 512: + ctx->uHashSize = 512; + ctx->uBlockLength = 128; + ctx->uRounds = 10; + ctx->hashsize = m256_const2_64( 0, 0x200 ); + ctx->const1536 = m256_const2_64( 0, 0x400 ); + break; + + default: + return 1; + } + + for( i = 0; i < 4; i++ ) + for( j = 0; j < nHashSize / 256; j++ ) + ctx->state[ i ][ j ] = ctx->hashsize; + + for( i = 0; i < 4; i++ ) + for( j = nHashSize / 256; j < 4; j++ ) + ctx->state[ i ][ j ] = m256_zero; + + return 0; +} + +int echo_2way_update_close( echo_2way_context *state, void *hashval, + const void *data, int databitlen ) +{ +// bytelen is either 32 (maybe), 64 or 80 or 128! 
+// all are less than full block. + + int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane + const int vblen = state->uBlockLength / 16; // 16 bytes per lane + __m256i remainingbits; + + if ( databitlen == 1024 ) + { + echo_2way_compress( state, data, 1 ); + state->processed_bits = 1024; + remainingbits = m256_const2_64( 0, -1024 ); + vlen = 0; + } + else + { + memcpy_256( state->buffer, data, vlen ); + state->processed_bits += (unsigned int)( databitlen ); + remainingbits = m256_const2_64( 0, databitlen ); + } + + state->buffer[ vlen ] = m256_const2_64( 0, 0x80 ); + memset_zero_256( state->buffer + vlen + 1, vblen - vlen - 2 ); + state->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)state->uHashSize << 48, 0 ); + state->buffer[ vblen-1 ] = m256_const2_64( 0, state->processed_bits ); + + state->k = _mm256_add_epi64( state->k, remainingbits ); + state->k = _mm256_sub_epi64( state->k, state->const1536 ); + + echo_2way_compress( state, state->buffer, 1 ); + + _mm256_store_si256( (__m256i*)hashval + 0, state->state[ 0 ][ 0] ); + _mm256_store_si256( (__m256i*)hashval + 1, state->state[ 1 ][ 0] ); + + if ( state->uHashSize == 512 ) + { + _mm256_store_si256( (__m256i*)hashval + 2, state->state[ 2 ][ 0 ] ); + _mm256_store_si256( (__m256i*)hashval + 3, state->state[ 3 ][ 0 ] ); + } + return 0; +} + +int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize, + const void *data, int datalen ) +{ + int i, j; + int databitlen = datalen * 8; + ctx->k = m256_zero; + ctx->processed_bits = 0; + ctx->uBufferBytes = 0; + + switch( nHashSize ) + { + case 256: + ctx->uHashSize = 256; + ctx->uBlockLength = 192; + ctx->uRounds = 8; + ctx->hashsize = m256_const2_64( 0, 0x100 ); + ctx->const1536 = m256_const2_64( 0, 0x600 ); + break; + + case 512: + ctx->uHashSize = 512; + ctx->uBlockLength = 128; + ctx->uRounds = 10; + ctx->hashsize = m256_const2_64( 0, 0x200 ); + ctx->const1536 = m256_const2_64( 0, 0x400 ); + break; + + default: + return 1; + } + + for( i = 0; i < 4; i++ ) + for( j = 0; j < nHashSize / 256; j++ ) + ctx->state[ i ][ j ] = ctx->hashsize; + + for( i = 0; i < 4; i++ ) + for( j = nHashSize / 256; j < 4; j++ ) + ctx->state[ i ][ j ] = m256_zero; + + int vlen = datalen / 32; + const int vblen = ctx->uBlockLength / 16; // 16 bytes per lane + __m256i remainingbits; + + if ( databitlen == 1024 ) + { + echo_2way_compress( ctx, data, 1 ); + ctx->processed_bits = 1024; + remainingbits = m256_const2_64( 0, -1024 ); + vlen = 0; + } + else + { + vlen = databitlen / 128; // * 4 lanes / 128 bits per lane + memcpy_256( ctx->buffer, data, vlen ); + ctx->processed_bits += (unsigned int)( databitlen ); + remainingbits = m256_const2_64( 0, databitlen ); + } + + ctx->buffer[ vlen ] = m256_const2_64( 0, 0x80 ); + memset_zero_256( ctx->buffer + vlen + 1, vblen - vlen - 2 ); + ctx->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)ctx->uHashSize << 48, 0 ); + ctx->buffer[ vblen-1 ] = m256_const2_64( 0, ctx->processed_bits ); + + ctx->k = _mm256_add_epi64( ctx->k, remainingbits ); + ctx->k = _mm256_sub_epi64( ctx->k, ctx->const1536 ); + + echo_2way_compress( ctx, ctx->buffer, 1 ); + + _mm256_store_si256( (__m256i*)hashval + 0, ctx->state[ 0 ][ 0] ); + _mm256_store_si256( (__m256i*)hashval + 1, ctx->state[ 1 ][ 0] ); + + if ( ctx->uHashSize == 512 ) + { + _mm256_store_si256( (__m256i*)hashval + 2, ctx->state[ 2 ][ 0 ] ); + _mm256_store_si256( (__m256i*)hashval + 3, ctx->state[ 3 ][ 0 ] ); + } + return 0; +} + + +#endif // VAES diff --git a/algo/echo/echo-hash-4way.h b/algo/echo/echo-hash-4way.h index 
f9e906f..5808685 100644 --- a/algo/echo/echo-hash-4way.h +++ b/algo/echo/echo-hash-4way.h @@ -1,10 +1,12 @@ #if !defined(ECHO_HASH_4WAY_H__) #define ECHO_HASH_4WAY_H__ 1 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) #include "simd-utils.h" +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + typedef struct { __m512i state[4][4]; @@ -20,6 +22,7 @@ typedef struct unsigned int processed_bits; } echo_4way_context __attribute__ ((aligned (64))); +#define echo512_4way_context echo_4way_context int echo_4way_init( echo_4way_context *state, int hashbitlen ); #define echo512_4way_init( state ) echo_4way_init( state, 512 ) @@ -29,8 +32,8 @@ int echo_4way_update( echo_4way_context *state, const void *data, unsigned int databitlen); #define echo512_4way_update echo_4way_update -int echo_close( echo_4way_context *state, void *hashval ); -#define echo512_4way_close echo_4way_close +// int echo_4way_close( echo_4way_context *state, void *hashval ); +// #define echo512_4way_close echo_4way_close int echo_4way_update_close( echo_4way_context *state, void *hashval, const void *data, int databitlen ); @@ -43,5 +46,45 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, #define echo256_4way_full( state, hashval, data, datalen ) \ echo_4way_full( state, hashval, 256, data, datalen ) -#endif -#endif +#endif // AVX512 + +typedef struct +{ + __m256i state[4][4]; + __m256i buffer[ 4 * 192 / 16 ]; // 4x128 interleaved 192 bytes + __m256i k; + __m256i hashsize; + __m256i const1536; + + unsigned int uRounds; + unsigned int uHashSize; + unsigned int uBlockLength; + unsigned int uBufferBytes; + unsigned int processed_bits; + +} echo_2way_context __attribute__ ((aligned (64))); +#define echo512_2way_context echo_2way_context + +int echo_2way_init( echo_2way_context *state, int hashbitlen ); +#define echo512_2way_init( state ) echo_2way_init( state, 512 ) +#define echo256_2way_init( state ) echo_2way_init( state, 256 ) + +int echo_2way_update( echo_2way_context *state, const void *data, + unsigned int databitlen); +#define echo512_2way_update echo_2way_update + +int echo_2way_update_close( echo_2way_context *state, void *hashval, + const void *data, int databitlen ); +#define echo512_2way_update_close echo_2way_update_close + +int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize, + const void *data, int datalen ); +#define echo512_2way_full( state, hashval, data, datalen ) \ + echo_2way_full( state, hashval, 512, data, datalen ) +#define echo256_2way_full( state, hashval, data, datalen ) \ + echo_2way_full( state, hashval, 256, data, datalen ) + + +#endif // VAES + +#endif // ECHO_HASH_4WAY_H__ diff --git a/algo/groestl/groestl256-hash-4way.c b/algo/groestl/groestl256-hash-4way.c index ef296fd..dd82a86 100644 --- a/algo/groestl/groestl256-hash-4way.c +++ b/algo/groestl/groestl256-hash-4way.c @@ -15,7 +15,9 @@ #include "miner.h" #include "simd-utils.h" -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX2__) && defined(__VAES__) + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen ) @@ -43,10 +45,10 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen ) } int groestl256_4way_full( groestl256_4way_context* ctx, void* output, 
- const void* input, uint64_t databitlen ) + const void* input, uint64_t datalen ) { - const int len = (int)databitlen / 128; - const int hashlen_m128i = 32 / 16; // bytes to __m128i + const int len = (int)datalen >> 4; + const int hashlen_m128i = 32 >> 4; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = ctx->rem_ptr; int blocks = len / SIZE256; @@ -172,5 +174,161 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output, return 0; } -#endif // VAES +#endif // AVX512 +// AVX2 + VAES + +int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen ) +{ + int i; + + ctx->hashlen = hashlen; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return 1; + + for ( i = 0; i < SIZE256; i++ ) + { + ctx->chaining[i] = m256_zero; + ctx->buffer[i] = m256_zero; + } + + // The only non-zero in the IV is len. It can be hard coded. + ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 ); + + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + return 0; +} + +int groestl256_2way_full( groestl256_2way_context* ctx, void* output, + const void* input, uint64_t datalen ) +{ + const int len = (int)datalen >> 4; + const int hashlen_m128i = 32 >> 4; // bytes to __m128i + const int hash_offset = SIZE256 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE256; + __m256i* in = (__m256i*)input; + int i; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return 1; + + for ( i = 0; i < SIZE256; i++ ) + { + ctx->chaining[i] = m256_zero; + ctx->buffer[i] = m256_zero; + } + + // The only non-zero in the IV is len. It can be hard coded. + ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + // --- update --- + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF512_2way( ctx->chaining, &in[ i * SIZE256 ] ); + ctx->buf_ptr = blocks * SIZE256; + + // copy any remaining data to buffer, it may already contain data + // from a previous update for a midstate precalc + for ( i = 0; i < len % SIZE256; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + + //--- final --- + + blocks++; // adjust for final block + + if ( i == SIZE256 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0x80 ); + } + else + { + // add first padding + ctx->buffer[i] = m256_const2_64( 0, 0x80 ); + // add zero padding + for ( i += 1; i < SIZE256 - 1; i++ ) + ctx->buffer[i] = m256_zero; + + // add length padding, second last byte is zero unless blocks > 255 + ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0 ); + } + +// digest final padding block and do output transform + TF512_2way( ctx->chaining, ctx->buffer ); + + OF512_2way( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} +int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output, + const void* input, uint64_t databitlen ) +{ + const int len = (int)databitlen / 128; + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE256 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE256; + __m256i* in = (__m256i*)input; + int i; + + // --- update --- + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF512_2way( ctx->chaining, &in[ i * SIZE256 ] ); + ctx->buf_ptr = 
blocks * SIZE256; + + // copy any remaining data to buffer, it may already contain data + // from a previous update for a midstate precalc + for ( i = 0; i < len % SIZE256; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + + //--- final --- + + blocks++; // adjust for final block + + if ( i == SIZE256 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m256_const1_128( _mm_set_epi8( + blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + } + else + { + // add first padding + ctx->buffer[i] = m256_const2_64( 0, 0x80 ); + // add zero padding + for ( i += 1; i < SIZE256 - 1; i++ ) + ctx->buffer[i] = m256_zero; + + // add length padding, second last byte is zero unless blocks > 255 + ctx->buffer[i] = m256_const1_128( _mm_set_epi8( + blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + } + +// digest final padding block and do output transform + TF512_2way( ctx->chaining, ctx->buffer ); + + OF512_2way( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + +#endif // VAES diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h index 907a64b..59c6270 100644 --- a/algo/groestl/groestl256-hash-4way.h +++ b/algo/groestl/groestl256-hash-4way.h @@ -18,8 +18,8 @@ #endif #include -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - +#if defined(__AVX2__) && defined(__VAES__) + #define LENGTH (256) //#include "brg_endian.h" @@ -48,6 +48,8 @@ #define SIZE256 (SIZE_512/16) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + typedef struct { __attribute__ ((aligned (128))) __m512i chaining[SIZE256]; __attribute__ ((aligned (64))) __m512i buffer[SIZE256]; @@ -55,7 +57,7 @@ typedef struct { int blk_count; // SIZE_m128i int buf_ptr; // __m128i offset int rem_ptr; - int databitlen; // bits +// int databitlen; // bits } groestl256_4way_context; @@ -74,5 +76,25 @@ int groestl256_4way_update_close( groestl256_4way_context*, void*, int groestl256_4way_full( groestl256_4way_context*, void*, const void*, uint64_t ); -#endif -#endif +#endif // AVX512 + +typedef struct { + __attribute__ ((aligned (128))) __m256i chaining[SIZE256]; + __attribute__ ((aligned (64))) __m256i buffer[SIZE256]; + int hashlen; // byte + int blk_count; // SIZE_m128i + int buf_ptr; // __m128i offset + int rem_ptr; +// int databitlen; // bits +} groestl256_2way_context; + +int groestl256_2way_init( groestl256_2way_context*, uint64_t ); + +int groestl256_2way_update_close( groestl256_2way_context*, void*, + const void*, uint64_t ); + +int groestl256_2way_full( groestl256_2way_context*, void*, + const void*, uint64_t ); + +#endif // VAES +#endif // GROESTL256_HASH_4WAY_H__ diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index 8175f74..25d9171 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -12,7 +12,7 @@ #include "groestl256-hash-4way.h" -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX2__) && defined(__VAES__) static const __m128i round_const_l0[] __attribute__ ((aligned (64))) = { @@ -42,6 +42,8 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) = { 0x0000000000000000, 0x8696a6b6c6d6e6f6 } }; 
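/* Background for the MUL2 / MUL2_2WAY macros defined below: they multiply
   every byte of a vector by 2 in GF(2^8), i.e. the AES xtime operation,
   reducing by the polynomial 0x11b whenever a byte's top bit is set.
   A minimal scalar sketch for illustration only; gf256_mul2 is a
   hypothetical helper, not part of this patch (uint8_t from <stdint.h>). */
static inline uint8_t gf256_mul2( uint8_t x )
{
   /* Shift left one bit, then conditionally xor in the reduction byte 0x1b. */
   return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0 ) );
}
/* The vector macros get the same result branchlessly: cmpgt_epi8 against
   zero turns each byte's sign bit into an all-ones mask, which is ANDed with
   a register of 0x1b bytes and xored into the doubled bytes. */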
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02, 0x1d1519111c141810, 0x1f171b131e161a12, 0x2d2529212c242820, 0x2f272b232e262a22, @@ -499,5 +501,398 @@ void OF512_4way( __m512i* chaining ) chaining[3] = xmm11; } +#endif // AVX512 + +static const __m256i TRANSP_MASK_2WAY = + { 0x0d0509010c040800, 0x0f070b030e060a02, + 0x1d1519111c141810, 0x1f171b131e161a12 }; + +static const __m256i SUBSH_MASK0_2WAY = + { 0x0c0f0104070b0e00, 0x03060a0d08020509, + 0x1c1f1114171b1e10, 0x13161a1d18121519 }; + +static const __m256i SUBSH_MASK1_2WAY = + { 0x0e090205000d0801, 0x04070c0f0a03060b, + 0x1e191215101d1801, 0x14171c1f1a13161b }; + +static const __m256i SUBSH_MASK2_2WAY = + { 0x080b0306010f0a02, 0x05000e090c04070d, + 0x181b1316111f1a12, 0x15101e191c14171d }; + +static const __m256i SUBSH_MASK3_2WAY = + { 0x0a0d040702090c03, 0x0601080b0e05000f, + 0x1a1d141712191c13, 0x1611181b1e15101f }; + +static const __m256i SUBSH_MASK4_2WAY = + { 0x0b0e0500030a0d04, 0x0702090c0f060108, + 0x1b1e1510131a1d14, 0x1712191c1f161118 }; + +static const __m256i SUBSH_MASK5_2WAY = + { 0x0d080601040c0f05, 0x00030b0e0907020a, + 0x1d181611141c1f15, 0x10131b1e1917121a }; + +static const __m256i SUBSH_MASK6_2WAY = + { 0x0f0a0702050e0906, 0x01040d080b00030c, + 0x1f1a1712151e1916, 0x11141d181b10131c }; + +static const __m256i SUBSH_MASK7_2WAY = + { 0x090c000306080b07, 0x02050f0a0d01040e, + 0x191c101316181b17, 0x12151f1a1d11141e, }; + +#define tos(a) #a +#define tostr(a) tos(a) + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2_2WAY(i, j, k){\ + j = _mm256_xor_si256(j, j);\ + j = _mm256_cmpgt_epi8(j, i );\ + i = _mm256_add_epi8(i, i);\ + j = _mm256_and_si256(j, k);\ + i = _mm256_xor_si256(i, j);\ +} + +#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm256_xor_si256(a0, a1);\ + b0 = a2;\ + a1 = _mm256_xor_si256(a1, a2);\ + b1 = a3;\ + a2 = _mm256_xor_si256(a2, a3);\ + b2 = a4;\ + a3 = _mm256_xor_si256(a3, a4);\ + b3 = a5;\ + a4 = _mm256_xor_si256(a4, a5);\ + b4 = a6;\ + a5 = _mm256_xor_si256(a5, a6);\ + b5 = a7;\ + a6 = _mm256_xor_si256(a6, a7);\ + a7 = _mm256_xor_si256(a7, b6);\ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm256_xor_si256(b0, a4);\ + b6 = _mm256_xor_si256(b6, a4);\ + b1 = _mm256_xor_si256(b1, a5);\ + b7 = _mm256_xor_si256(b7, a5);\ + b2 = _mm256_xor_si256(b2, a6);\ + b0 = _mm256_xor_si256(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm256_xor_si256(b3, a7);\ + b1 = _mm256_xor_si256(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm256_xor_si256(b4, a0);\ + b2 = _mm256_xor_si256(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm256_xor_si256(b5, a1);\ + b3 = _mm256_xor_si256(b3, a1);\ + b1 = a1;\ + b6 = _mm256_xor_si256(b6, a2);\ + b4 = _mm256_xor_si256(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm256_xor_si256(b7, a3);\ + b5 = _mm256_xor_si256(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm256_xor_si256(a0, a3);\ + a1 = _mm256_xor_si256(a1, a4);\ + a2 = _mm256_xor_si256(a2, a5);\ + a3 = _mm256_xor_si256(a3, a6);\ + a4 = _mm256_xor_si256(a4, a7);\ + a5 = _mm256_xor_si256(a5, b0);\ + a6 = _mm256_xor_si256(a6, b1);\ + a7 = _mm256_xor_si256(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\ + MUL2_2WAY(a0, b0, b1);\ + a0 = _mm256_xor_si256(a0, TEMP0);\ + MUL2_2WAY(a1, b0, b1);\ + a1 = _mm256_xor_si256(a1, TEMP1);\ + MUL2_2WAY(a2, b0, b1);\ + a2 = _mm256_xor_si256(a2, b2);\ + MUL2_2WAY(a3, b0, b1);\ + a3 = _mm256_xor_si256(a3, b3);\ + MUL2_2WAY(a4, b0, b1);\ + a4 = _mm256_xor_si256(a4, b4);\ + MUL2_2WAY(a5, b0, b1);\ + a5 = _mm256_xor_si256(a5, b5);\ + MUL2_2WAY(a6, b0, b1);\ + a6 = _mm256_xor_si256(a6, b6);\ + MUL2_2WAY(a7, b0, b1);\ + a7 = _mm256_xor_si256(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2_2WAY(a0, b0, b1);\ + b5 = _mm256_xor_si256(b5, a0);\ + MUL2_2WAY(a1, b0, b1);\ + b6 = _mm256_xor_si256(b6, a1);\ + MUL2_2WAY(a2, b0, b1);\ + b7 = _mm256_xor_si256(b7, a2);\ + MUL2_2WAY(a5, b0, b1);\ + b2 = _mm256_xor_si256(b2, a5);\ + MUL2_2WAY(a6, b0, b1);\ + b3 = _mm256_xor_si256(b3, a6);\ + MUL2_2WAY(a7, b0, b1);\ + b4 = _mm256_xor_si256(b4, a7);\ + MUL2_2WAY(a3, b0, b1);\ + MUL2_2WAY(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm256_xor_si256(b0, a3);\ + b1 = _mm256_xor_si256(b1, a4);\ +}/*MixBytes*/ + +#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + b1 = m256_const2_64( 0xffffffffffffffff, 0 ); \ + a0 = _mm256_xor_si256( a0, m256_const1_128( round_const_l0[i] ) );\ + a1 = _mm256_xor_si256( a1, b1 );\ + a2 = _mm256_xor_si256( a2, b1 );\ + a3 = _mm256_xor_si256( a3, b1 );\ + a4 = _mm256_xor_si256( a4, b1 );\ + a5 = _mm256_xor_si256( a5, b1 );\ + a6 = _mm256_xor_si256( a6, b1 );\ + a7 = _mm256_xor_si256( a7, m256_const1_128( round_const_l7[i] ) );\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm256_xor_si256( b0, b0 );\ + a0 = _mm256_shuffle_epi8( a0, SUBSH_MASK0_2WAY );\ + a0 = _mm256_aesenclast_epi128(a0, b0 );\ + a1 = _mm256_shuffle_epi8( a1, SUBSH_MASK1_2WAY );\ + a1 = _mm256_aesenclast_epi128(a1, b0 );\ + a2 = _mm256_shuffle_epi8( a2, SUBSH_MASK2_2WAY );\ + a2 = _mm256_aesenclast_epi128(a2, b0 );\ + a3 = _mm256_shuffle_epi8( a3, SUBSH_MASK3_2WAY );\ + a3 = _mm256_aesenclast_epi128(a3, b0 );\ + a4 = _mm256_shuffle_epi8( a4, SUBSH_MASK4_2WAY );\ + a4 = _mm256_aesenclast_epi128(a4, b0 );\ + a5 = _mm256_shuffle_epi8( a5, SUBSH_MASK5_2WAY );\ + a5 = _mm256_aesenclast_epi128(a5, b0 );\ + a6 = _mm256_shuffle_epi8( a6, SUBSH_MASK6_2WAY );\ + a6 = 
_mm256_aesenclast_epi128(a6, b0 );\ + a7 = _mm256_shuffle_epi8( a7, SUBSH_MASK7_2WAY );\ + a7 = _mm256_aesenclast_epi128( a7, b0 );\ + \ + /* MixBytes */\ + MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q_2WAY(){\ + ROUND_2WAY(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND_2WAY(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND_2WAY(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND_2WAY(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND_2WAY(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND_2WAY(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND_2WAY(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND_2WAY(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND_2WAY(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND_2WAY(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +#define Matrix_Transpose_A_2way(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK_2WAY;\ + \ + i0 = _mm256_shuffle_epi8( i0, t0 );\ + i1 = _mm256_shuffle_epi8( i1, t0 );\ + i2 = _mm256_shuffle_epi8( i2, t0 );\ + i3 = _mm256_shuffle_epi8( i3, t0 );\ + \ + o1 = i0;\ + t0 = i2;\ + \ + i0 = _mm256_unpacklo_epi16( i0, i1 );\ + o1 = _mm256_unpackhi_epi16( o1, i1 );\ + i2 = _mm256_unpacklo_epi16( i2, i3 );\ + t0 = _mm256_unpackhi_epi16( t0, i3 );\ + \ + i0 = _mm256_shuffle_epi32( i0, 216 );\ + o1 = _mm256_shuffle_epi32( o1, 216 );\ + i2 = _mm256_shuffle_epi32( i2, 216 );\ + t0 = _mm256_shuffle_epi32( t0, 216 );\ + \ + o2 = i0;\ + o3 = o1;\ + \ + i0 = _mm256_unpacklo_epi32( i0, i2 );\ + o1 = _mm256_unpacklo_epi32( o1, t0 );\ + o2 = _mm256_unpackhi_epi32( o2, i2 );\ + o3 = _mm256_unpackhi_epi32( o3, t0 );\ +}/**/ + +#define Matrix_Transpose_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm256_unpacklo_epi64( i0, i4 );\ + o1 = _mm256_unpackhi_epi64( o1, i4 );\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm256_unpacklo_epi64( o2, i5 );\ + o3 = _mm256_unpackhi_epi64( o3, i5 );\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm256_unpacklo_epi64( o4, i6 );\ + o5 = _mm256_unpackhi_epi64( o5, i6 );\ + o7 = i3;\ + o6 = _mm256_unpacklo_epi64( o6, i7 );\ + o7 = _mm256_unpackhi_epi64( o7, i7 );\ +}/**/ + +#define Matrix_Transpose_B_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm256_unpacklo_epi64( i0, i1 );\ + o0 = _mm256_unpackhi_epi64( o0, i1 );\ + o1 = i2;\ + i2 = _mm256_unpacklo_epi64( i2, i3 );\ + o1 = _mm256_unpackhi_epi64( o1, i3 );\ + o2 = i4;\ + i4 = _mm256_unpacklo_epi64( i4, i5 );\ + o2 = _mm256_unpackhi_epi64( o2, i5 );\ + o3 = i6;\ + i6 = _mm256_unpacklo_epi64( i6, i7 );\ + o3 = _mm256_unpackhi_epi64( o3, i7 );\ +}/**/ + +#define Matrix_Transpose_O_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm256_xor_si256( t0, t0 );\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = _mm256_unpacklo_epi64( i0, t0 );\ + i1 = _mm256_unpackhi_epi64( 
i1, t0 );\ + i2 = _mm256_unpacklo_epi64( i2, t0 );\ + i3 = _mm256_unpackhi_epi64( i3, t0 );\ + i4 = _mm256_unpacklo_epi64( i4, t0 );\ + i5 = _mm256_unpackhi_epi64( i5, t0 );\ + i6 = _mm256_unpacklo_epi64( i6, t0 );\ + i7 = _mm256_unpackhi_epi64( i7, t0 );\ +}/**/ + +#define Matrix_Transpose_O_B_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm256_unpacklo_epi64( i0, i1 );\ + i2 = _mm256_unpacklo_epi64( i2, i3 );\ + i4 = _mm256_unpacklo_epi64( i4, i5 );\ + i6 = _mm256_unpacklo_epi64( i6, i7 );\ +}/**/ + +void TF512_2way( __m256i* chaining, __m256i* message ) +{ + static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m256i TEMP0; + static __m256i TEMP1; + static __m256i TEMP2; + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A_2way(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm256_xor_si256( xmm8, xmm12 ); + xmm0 = _mm256_xor_si256( xmm0, xmm2 ); + xmm4 = _mm256_xor_si256( xmm4, xmm6 ); + xmm5 = _mm256_xor_si256( xmm5, xmm7 ); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B_2way(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q_2WAY(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm256_xor_si256( xmm0, xmm8 ); + xmm1 = _mm256_xor_si256( xmm1, xmm10 ); + xmm2 = _mm256_xor_si256( xmm2, xmm12 ); + xmm3 = _mm256_xor_si256( xmm3, xmm14 ); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm256_xor_si256( xmm0, (chaining[0]) ); + xmm1 = _mm256_xor_si256( xmm1, (chaining[1]) ); + xmm2 = _mm256_xor_si256( xmm2, (chaining[2]) ); + xmm3 = _mm256_xor_si256( xmm3, (chaining[3]) ); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + + return; +} + +void OF512_2way( __m256i* chaining ) +{ + static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m256i TEMP0; + static __m256i TEMP1; + static __m256i TEMP2; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* 
compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q_2WAY(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) ); + xmm10 = _mm256_xor_si256( xmm10, (chaining[1]) ); + xmm12 = _mm256_xor_si256( xmm12, (chaining[2]) ); + xmm14 = _mm256_xor_si256( xmm14, (chaining[3]) ); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A_2way(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + #endif // VAES -#endif // GROESTL512_INTR_4WAY_H__ +#endif // GROESTL256_INTR_4WAY_H__ diff --git a/algo/groestl/groestl512-hash-4way.c b/algo/groestl/groestl512-hash-4way.c index 8e5e139..bff6af5 100644 --- a/algo/groestl/groestl512-hash-4way.c +++ b/algo/groestl/groestl512-hash-4way.c @@ -15,7 +15,9 @@ #include "miner.h" #include "simd-utils.h" -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX2__) && defined(__VAES__) + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen ) { @@ -137,5 +139,130 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output, return 0; } +#endif // AVX512 + + +// AVX2 + VAES + +int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen ) +{ + if (ctx->chaining == NULL || ctx->buffer == NULL) + return 1; + + memset_zero_256( ctx->chaining, SIZE512 ); + memset_zero_256( ctx->buffer, SIZE512 ); + + // The only non-zero in the IV is len. It can be hard coded. 
+ ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 ); + + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + return 0; +} + +int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output, + const void* input, uint64_t databitlen ) +{ + const int len = (int)databitlen / 128; + const int hashlen_m128i = 64 / 16; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE512; + __m256i* in = (__m256i*)input; + int i; + + // --- update --- + + for ( i = 0; i < blocks; i++ ) + TF1024_2way( ctx->chaining, &in[ i * SIZE512 ] ); + ctx->buf_ptr = blocks * SIZE512; + + for ( i = 0; i < len % SIZE512; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; + + //--- final --- + + blocks++; // adjust for final block + + if ( i == SIZE512 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m256_const1_128( _mm_set_epi8( + blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + } + else + { + ctx->buffer[i] = m256_const2_64( 0, 0x80 ); + for ( i += 1; i < SIZE512 - 1; i++ ) + ctx->buffer[i] = m256_zero; + ctx->buffer[i] = m256_const1_128( _mm_set_epi8( + blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + } + + TF1024_2way( ctx->chaining, ctx->buffer ); + OF1024_2way( ctx->chaining ); + + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + +int groestl512_2way_full( groestl512_2way_context* ctx, void* output, + const void* input, uint64_t datalen ) +{ + const int len = (int)datalen >> 4; + const int hashlen_m128i = 64 >> 4; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; + uint64_t blocks = len / SIZE512; + __m256i* in = (__m256i*)input; + int i; + + // --- init --- + + memset_zero_256( ctx->chaining, SIZE512 ); + memset_zero_256( ctx->buffer, SIZE512 ); + ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + // --- update --- + + for ( i = 0; i < blocks; i++ ) + TF1024_2way( ctx->chaining, &in[ i * SIZE512 ] ); + ctx->buf_ptr = blocks * SIZE512; + + for ( i = 0; i < len % SIZE512; i++ ) + ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ]; + i += ctx->rem_ptr; + + // --- close --- + + blocks++; + + if ( i == SIZE512 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 ); + } + else + { + ctx->buffer[i] = m256_const2_64( 0, 0x80 ); + for ( i += 1; i < SIZE512 - 1; i++ ) + ctx->buffer[i] = m256_zero; + ctx->buffer[i] = m256_const2_64( blocks << 56, 0 ); + } + + TF1024_2way( ctx->chaining, ctx->buffer ); + OF1024_2way( ctx->chaining ); + + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + #endif // VAES diff --git a/algo/groestl/groestl512-hash-4way.h b/algo/groestl/groestl512-hash-4way.h index 68ac7e5..7025428 100644 --- a/algo/groestl/groestl512-hash-4way.h +++ b/algo/groestl/groestl512-hash-4way.h @@ -10,7 +10,7 @@ #endif #include -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX2__) && defined(__VAES__) #define LENGTH (512) @@ -36,20 +36,19 @@ #define SIZE512 (SIZE_1024/16) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + typedef struct { __attribute__ ((aligned (128))) __m512i chaining[SIZE512]; __attribute__ ((aligned (64))) __m512i 
buffer[SIZE512]; int blk_count; // SIZE_m128i int buf_ptr; // __m128i offset int rem_ptr; - int databitlen; // bits } groestl512_4way_context; int groestl512_4way_init( groestl512_4way_context*, uint64_t ); -//int reinit_groestl( hashState_groestl* ); - int groestl512_4way_update( groestl512_4way_context*, const void*, uint64_t ); int groestl512_4way_close( groestl512_4way_context*, void* ); @@ -58,5 +57,29 @@ int groestl512_4way_update_close( groestl512_4way_context*, void*, int groestl512_4way_full( groestl512_4way_context*, void*, const void*, uint64_t ); +#endif // AVX512 + +// AVX2 + VAES + +typedef struct { + __attribute__ ((aligned (128))) __m256i chaining[SIZE512]; + __attribute__ ((aligned (64))) __m256i buffer[SIZE512]; + int blk_count; // SIZE_m128i + int buf_ptr; // __m128i offset + int rem_ptr; +} groestl512_2way_context; + + +int groestl512_2way_init( groestl512_2way_context*, uint64_t ); + +int groestl512_2way_update( groestl512_2way_context*, const void*, + uint64_t ); +int groestl512_2way_close( groestl512_2way_context*, void* ); +int groestl512_2way_update_close( groestl512_2way_context*, void*, + const void*, uint64_t ); +int groestl512_2way_full( groestl512_2way_context*, void*, + const void*, uint64_t ); + + #endif // VAES #endif // GROESTL512_HASH_4WAY_H__ diff --git a/algo/groestl/groestl512-intr-4way.h b/algo/groestl/groestl512-intr-4way.h index 96788f4..5d8d715 100644 --- a/algo/groestl/groestl512-intr-4way.h +++ b/algo/groestl/groestl512-intr-4way.h @@ -12,7 +12,7 @@ #include "groestl512-hash-4way.h" -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX2__) && defined(__VAES__) static const __m128i round_const_p[] __attribute__ ((aligned (64))) = { @@ -50,6 +50,8 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) = { 0x8292a2b2c2d2e2f2, 0x0212223242526272 } }; +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02, 0x1d1519111c141810, 0x1f171b131e161a12, 0x2d2529212c242820, 0x2f272b232e262a22, @@ -660,5 +662,578 @@ void OF1024_4way( __m512i* chaining ) return; } +#endif // AVX512 + +// AVX2 + VAES + +static const __m256i TRANSP_MASK_2WAY = + { 0x0d0509010c040800, 0x0f070b030e060a02, + 0x1d1519111c141810, 0x1f171b131e161a12 }; + +static const __m256i SUBSH_MASK0_2WAY = + { 0x0b0e0104070a0d00, 0x0306090c0f020508, + 0x1b1e1114171a1d10, 0x1316191c1f121518 }; + +static const __m256i SUBSH_MASK1_2WAY = + { 0x0c0f0205080b0e01, 0x04070a0d00030609, + 0x1c1f1215181b1e11, 0x14171a1d10131619 }; + +static const __m256i SUBSH_MASK2_2WAY = + { 0x0d000306090c0f02, 0x05080b0e0104070a, + 0x1d101316191c1f12, 0x15181b1e1114171a }; + +static const __m256i SUBSH_MASK3_2WAY = + { 0x0e0104070a0d0003, 0x06090c0f0205080b, + 0x1e1114171a1d1013, 0x16191c1f1215181b }; + +static const __m256i SUBSH_MASK4_2WAY = + { 0x0f0205080b0e0104, 0x070a0d000306090c, + 0x1f1215181b1e1114, 0x171a1d101316191c }; + +static const __m256i SUBSH_MASK5_2WAY = + { 0x000306090c0f0205, 0x080b0e0104070a0d, + 0x101316191c1f1215, 0x181b1e1114171a1d }; + +static const __m256i SUBSH_MASK6_2WAY = + { 0x0104070a0d000306, 0x090c0f0205080b0e, + 0x1114171a1d101316, 0x191c1f1215181b1e }; + +static const __m256i SUBSH_MASK7_2WAY = + { 0x06090c0f0205080b, 0x0e0104070a0d0003, + 0x16191c1f1215181b, 0x1e1114171a1d1013 }; + +#define tos(a) #a +#define tostr(a) tos(a) + +/* xmm[i] will be multiplied by 2 + 
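* in GF(2^8): each byte is shifted left one bit and any byte whose msb
+ * was set picks up an extra xor with the AES reduction polynomial 0x1b;
+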
* xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2_2WAY(i, j, k){\ + j = _mm256_xor_si256(j, j);\ + j = _mm256_cmpgt_epi8(j, i );\ + i = _mm256_add_epi8(i, i);\ + j = _mm256_and_si256(j, k);\ + i = _mm256_xor_si256(i, j);\ +} + +#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm256_xor_si256(a0, a1);\ + b0 = a2;\ + a1 = _mm256_xor_si256(a1, a2);\ + b1 = a3;\ + a2 = _mm256_xor_si256(a2, a3);\ + b2 = a4;\ + a3 = _mm256_xor_si256(a3, a4);\ + b3 = a5;\ + a4 = _mm256_xor_si256(a4, a5);\ + b4 = a6;\ + a5 = _mm256_xor_si256(a5, a6);\ + b5 = a7;\ + a6 = _mm256_xor_si256(a6, a7);\ + a7 = _mm256_xor_si256(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm256_xor_si256(b0, a4);\ + b6 = _mm256_xor_si256(b6, a4);\ + b1 = _mm256_xor_si256(b1, a5);\ + b7 = _mm256_xor_si256(b7, a5);\ + b2 = _mm256_xor_si256(b2, a6);\ + b0 = _mm256_xor_si256(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm256_xor_si256(b3, a7);\ + b1 = _mm256_xor_si256(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm256_xor_si256(b4, a0);\ + b2 = _mm256_xor_si256(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm256_xor_si256(b5, a1);\ + b3 = _mm256_xor_si256(b3, a1);\ + b1 = a1;\ + b6 = _mm256_xor_si256(b6, a2);\ + b4 = _mm256_xor_si256(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm256_xor_si256(b7, a3);\ + b5 = _mm256_xor_si256(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm256_xor_si256(a0, a3);\ + a1 = _mm256_xor_si256(a1, a4);\ + a2 = _mm256_xor_si256(a2, a5);\ + a3 = _mm256_xor_si256(a3, a6);\ + a4 = _mm256_xor_si256(a4, a7);\ + a5 = _mm256_xor_si256(a5, b0);\ + a6 = _mm256_xor_si256(a6, b1);\ + a7 = _mm256_xor_si256(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\ + MUL2_2WAY(a0, b0, b1);\ + a0 = _mm256_xor_si256(a0, TEMP0);\ + MUL2_2WAY(a1, b0, b1);\ + a1 = _mm256_xor_si256(a1, TEMP1);\ + MUL2_2WAY(a2, b0, b1);\ + a2 = _mm256_xor_si256(a2, b2);\ + MUL2_2WAY(a3, b0, b1);\ + a3 = _mm256_xor_si256(a3, b3);\ + MUL2_2WAY(a4, b0, b1);\ + a4 = _mm256_xor_si256(a4, b4);\ + MUL2_2WAY(a5, b0, b1);\ + a5 = _mm256_xor_si256(a5, b5);\ + MUL2_2WAY(a6, b0, b1);\ + a6 = _mm256_xor_si256(a6, b6);\ + MUL2_2WAY(a7, b0, b1);\ + a7 = _mm256_xor_si256(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
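(b2..b7 pick up the doubled values; b0 and b1 are rebuilt at the end from the spilled TEMP0/TEMP1)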
*/\ + MUL2_2WAY(a0, b0, b1);\ + b5 = _mm256_xor_si256(b5, a0);\ + MUL2_2WAY(a1, b0, b1);\ + b6 = _mm256_xor_si256(b6, a1);\ + MUL2_2WAY(a2, b0, b1);\ + b7 = _mm256_xor_si256(b7, a2);\ + MUL2_2WAY(a5, b0, b1);\ + b2 = _mm256_xor_si256(b2, a5);\ + MUL2_2WAY(a6, b0, b1);\ + b3 = _mm256_xor_si256(b3, a6);\ + MUL2_2WAY(a7, b0, b1);\ + b4 = _mm256_xor_si256(b4, a7);\ + MUL2_2WAY(a3, b0, b1);\ + MUL2_2WAY(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm256_xor_si256(b0, a3);\ + b1 = _mm256_xor_si256(b1, a4);\ +}/*MixBytes*/ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX_2WAY(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes */\ + b0 = _mm256_xor_si256( b0, b0 );\ + a0 = _mm256_aesenclast_epi128( a0, b0 );\ + a1 = _mm256_aesenclast_epi128( a1, b0 );\ + a2 = _mm256_aesenclast_epi128( a2, b0 );\ + a3 = _mm256_aesenclast_epi128( a3, b0 );\ + a4 = _mm256_aesenclast_epi128( a4, b0 );\ + a5 = _mm256_aesenclast_epi128( a5, b0 );\ + a6 = _mm256_aesenclast_epi128( a6, b0 );\ + a7 = _mm256_aesenclast_epi128( a7, b0 );\ + /* MixBytes */\ + MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P_2WAY(){\ + uint8_t round_counter = 0;\ + for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \ + { \ + /* AddRoundConstant P1024 */\ + xmm8 = _mm256_xor_si256( xmm8, m256_const1_128( \ + casti_m128i( round_const_p, round_counter ) ) ); \ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \ + xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\ + xmm10 = _mm256_shuffle_epi8( xmm10, SUBSH_MASK2_2WAY );\ + xmm11 = _mm256_shuffle_epi8( xmm11, SUBSH_MASK3_2WAY );\ + xmm12 = _mm256_shuffle_epi8( xmm12, SUBSH_MASK4_2WAY );\ + xmm13 = _mm256_shuffle_epi8( xmm13, SUBSH_MASK5_2WAY );\ + xmm14 = _mm256_shuffle_epi8( xmm14, SUBSH_MASK6_2WAY );\ + xmm15 = _mm256_shuffle_epi8( xmm15, SUBSH_MASK7_2WAY );\ + /* SubBytes + MixBytes */\ + SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant P1024 */\ + xmm0 = _mm256_xor_si256( xmm0, m256_const1_128( \ + casti_m128i( round_const_p, round_counter+1 ) ) ); \ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\ + xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\ + xmm2 = _mm256_shuffle_epi8( xmm2, SUBSH_MASK2_2WAY );\ + xmm3 = _mm256_shuffle_epi8( xmm3, SUBSH_MASK3_2WAY );\ + xmm4 = _mm256_shuffle_epi8( xmm4, SUBSH_MASK4_2WAY );\ + xmm5 = _mm256_shuffle_epi8( xmm5, SUBSH_MASK5_2WAY );\ + xmm6 = _mm256_shuffle_epi8( xmm6, SUBSH_MASK6_2WAY );\ + xmm7 = _mm256_shuffle_epi8( xmm7, SUBSH_MASK7_2WAY );\ + /* SubBytes + MixBytes */\ + SUBMIX_2WAY(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +#define ROUNDS_Q_2WAY(){\ + uint8_t round_counter = 0;\ + for ( round_counter = 0; round_counter < 14; round_counter += 2) \ + { \ + /* AddRoundConstant Q1024 */\ + xmm1 = m256_neg1;\ + xmm8 = _mm256_xor_si256( xmm8, xmm1 );\ + xmm9 = _mm256_xor_si256( xmm9, xmm1 );\ + xmm10 = _mm256_xor_si256( xmm10, xmm1 );\ + xmm11 = _mm256_xor_si256( xmm11, xmm1 );\ + xmm12 = _mm256_xor_si256( xmm12, xmm1 );\ + xmm13 = _mm256_xor_si256( xmm13, xmm1 );\ + xmm14 = _mm256_xor_si256( xmm14, xmm1 );\ + xmm15 = _mm256_xor_si256( xmm15, m256_const1_128( \ + casti_m128i( round_const_q, round_counter ) ) ); \ + /* ShiftBytes Q1024 + 
pre-AESENCLAST */\ + xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\ + xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\ + xmm10 = _mm256_shuffle_epi8( xmm10, SUBSH_MASK5_2WAY );\ + xmm11 = _mm256_shuffle_epi8( xmm11, SUBSH_MASK7_2WAY );\ + xmm12 = _mm256_shuffle_epi8( xmm12, SUBSH_MASK0_2WAY );\ + xmm13 = _mm256_shuffle_epi8( xmm13, SUBSH_MASK2_2WAY );\ + xmm14 = _mm256_shuffle_epi8( xmm14, SUBSH_MASK4_2WAY );\ + xmm15 = _mm256_shuffle_epi8( xmm15, SUBSH_MASK6_2WAY );\ + /* SubBytes + MixBytes */\ + SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant Q1024 */\ + xmm9 = m256_neg1;\ + xmm0 = _mm256_xor_si256( xmm0, xmm9 );\ + xmm1 = _mm256_xor_si256( xmm1, xmm9 );\ + xmm2 = _mm256_xor_si256( xmm2, xmm9 );\ + xmm3 = _mm256_xor_si256( xmm3, xmm9 );\ + xmm4 = _mm256_xor_si256( xmm4, xmm9 );\ + xmm5 = _mm256_xor_si256( xmm5, xmm9 );\ + xmm6 = _mm256_xor_si256( xmm6, xmm9 );\ + xmm7 = _mm256_xor_si256( xmm7, m256_const1_128( \ + casti_m128i( round_const_q, round_counter+1 ) ) ); \ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\ + xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\ + xmm2 = _mm256_shuffle_epi8( xmm2, SUBSH_MASK5_2WAY );\ + xmm3 = _mm256_shuffle_epi8( xmm3, SUBSH_MASK7_2WAY );\ + xmm4 = _mm256_shuffle_epi8( xmm4, SUBSH_MASK0_2WAY );\ + xmm5 = _mm256_shuffle_epi8( xmm5, SUBSH_MASK2_2WAY );\ + xmm6 = _mm256_shuffle_epi8( xmm6, SUBSH_MASK4_2WAY );\ + xmm7 = _mm256_shuffle_epi8( xmm7, SUBSH_MASK6_2WAY );\ + /* SubBytes + MixBytes */\ + SUBMIX_2WAY(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +#define Matrix_Transpose_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK_2WAY;\ +\ + i6 = _mm256_shuffle_epi8(i6, t0);\ + i0 = _mm256_shuffle_epi8(i0, t0);\ + i1 = _mm256_shuffle_epi8(i1, t0);\ + i2 = _mm256_shuffle_epi8(i2, t0);\ + i3 = _mm256_shuffle_epi8(i3, t0);\ + t1 = i2;\ + i4 = _mm256_shuffle_epi8(i4, t0);\ + i5 = _mm256_shuffle_epi8(i5, t0);\ + t2 = i4;\ + t3 = i6;\ + i7 = _mm256_shuffle_epi8(i7, t0);\ +\ + /* continue with unpack using 4 temp registers */\ + t0 = i0;\ + t2 = _mm256_unpackhi_epi16(t2, i5);\ + i4 = _mm256_unpacklo_epi16(i4, i5);\ + t3 = _mm256_unpackhi_epi16(t3, i7);\ + i6 = _mm256_unpacklo_epi16(i6, i7);\ + t0 = _mm256_unpackhi_epi16(t0, i1);\ + t1 = _mm256_unpackhi_epi16(t1, i3);\ + i2 = _mm256_unpacklo_epi16(i2, i3);\ + i0 = _mm256_unpacklo_epi16(i0, i1);\ +\ + /* shuffle with immediate */\ + t0 = _mm256_shuffle_epi32(t0, 216);\ + t1 = _mm256_shuffle_epi32(t1, 216);\ + t2 = _mm256_shuffle_epi32(t2, 216);\ + t3 = _mm256_shuffle_epi32(t3, 216);\ + i0 = _mm256_shuffle_epi32(i0, 216);\ + i2 = _mm256_shuffle_epi32(i2, 216);\ + i4 = _mm256_shuffle_epi32(i4, 216);\ + i6 = _mm256_shuffle_epi32(i6, 216);\ +\ + /* continue with unpack */\ + t4 = i0;\ + i0 = _mm256_unpacklo_epi32(i0, i2);\ + t4 = _mm256_unpackhi_epi32(t4, i2);\ + t5 = t0;\ + t0 = _mm256_unpacklo_epi32(t0, t1);\ + t5 = _mm256_unpackhi_epi32(t5, t1);\ + t6 = i4;\ + i4 = _mm256_unpacklo_epi32(i4, i6);\ + t7 = t2;\ + t6 = _mm256_unpackhi_epi32(t6, i6);\ + i2 = t0;\ + t2 = _mm256_unpacklo_epi32(t2, t3);\ + i3 = t0;\ + t7 = _mm256_unpackhi_epi32(t7, t3);\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = i0;\ + i1 = _mm256_unpackhi_epi64(i1, i4);\ + i0 = _mm256_unpacklo_epi64(i0, i4);\ + i4 = t4;\ + i3 = 
_mm256_unpackhi_epi64(i3, t2);\ + i5 = t4;\ + i2 = _mm256_unpacklo_epi64(i2, t2);\ + i6 = t5;\ + i5 = _mm256_unpackhi_epi64(i5, t6);\ + i7 = t5;\ + i4 = _mm256_unpacklo_epi64(i4, t6);\ + i7 = _mm256_unpackhi_epi64(i7, t7);\ + i6 = _mm256_unpacklo_epi64(i6, t7);\ + /* transpose done */\ +}/**/ + +#define Matrix_Transpose_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + o1 = i0;\ + i0 = _mm256_unpacklo_epi64(i0, i1);\ + o1 = _mm256_unpackhi_epi64(o1, i1);\ + t0 = i2;\ + i2 = _mm256_unpacklo_epi64(i2, i3);\ + t0 = _mm256_unpackhi_epi64(t0, i3);\ + t1 = i4;\ + i4 = _mm256_unpacklo_epi64(i4, i5);\ + t1 = _mm256_unpackhi_epi64(t1, i5);\ + t2 = i6;\ + o0 = TRANSP_MASK_2WAY;\ + i6 = _mm256_unpacklo_epi64(i6, i7);\ + t2 = _mm256_unpackhi_epi64(t2, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm256_shuffle_epi8(i0, o0);\ + i2 = _mm256_shuffle_epi8(i2, o0);\ + i4 = _mm256_shuffle_epi8(i4, o0);\ + i6 = _mm256_shuffle_epi8(i6, o0);\ + o1 = _mm256_shuffle_epi8(o1, o0);\ + t0 = _mm256_shuffle_epi8(t0, o0);\ + t1 = _mm256_shuffle_epi8(t1, o0);\ + t2 = _mm256_shuffle_epi8(t2, o0);\ + /* continue with unpack using 4 temp registers */\ + t3 = i4;\ + o2 = o1;\ + o0 = i0;\ + t4 = t1;\ + \ + t3 = _mm256_unpackhi_epi16(t3, i6);\ + i4 = _mm256_unpacklo_epi16(i4, i6);\ + o0 = _mm256_unpackhi_epi16(o0, i2);\ + i0 = _mm256_unpacklo_epi16(i0, i2);\ + o2 = _mm256_unpackhi_epi16(o2, t0);\ + o1 = _mm256_unpacklo_epi16(o1, t0);\ + t4 = _mm256_unpackhi_epi16(t4, t2);\ + t1 = _mm256_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm256_shuffle_epi32(i4, 216);\ + t3 = _mm256_shuffle_epi32(t3, 216);\ + o1 = _mm256_shuffle_epi32(o1, 216);\ + o2 = _mm256_shuffle_epi32(o2, 216);\ + i0 = _mm256_shuffle_epi32(i0, 216);\ + o0 = _mm256_shuffle_epi32(o0, 216);\ + t1 = _mm256_shuffle_epi32(t1, 216);\ + t4 = _mm256_shuffle_epi32(t4, 216);\ + /* continue with unpack */\ + i1 = i0;\ + i3 = o0;\ + i5 = o1;\ + i7 = o2;\ + i0 = _mm256_unpacklo_epi32(i0, i4);\ + i1 = _mm256_unpackhi_epi32(i1, i4);\ + o0 = _mm256_unpacklo_epi32(o0, t3);\ + i3 = _mm256_unpackhi_epi32(i3, t3);\ + o1 = _mm256_unpacklo_epi32(o1, t1);\ + i5 = _mm256_unpackhi_epi32(i5, t1);\ + o2 = _mm256_unpacklo_epi32(o2, t4);\ + i7 = _mm256_unpackhi_epi32(i7, t4);\ + /* transpose done */\ +}/**/ + +void INIT_2way( __m256i *chaining ) +{ + static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024_2way( __m256i *chaining, const __m256i *message ) +{ + static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m256i QTEMP[8]; + static __m256i TEMP0; + static __m256i TEMP1; + static __m256i TEMP2; + + /* load message into registers xmm8 - 
xmm15 (Q = message) */ + xmm8 = message[0]; + xmm9 = message[1]; + xmm10 = message[2]; + xmm11 = message[3]; + xmm12 = message[4]; + xmm13 = message[5]; + xmm14 = message[6]; + xmm15 = message[7]; + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store message M (Q input) for later */ + QTEMP[0] = xmm8; + QTEMP[1] = xmm9; + QTEMP[2] = xmm10; + QTEMP[3] = xmm11; + QTEMP[4] = xmm12; + QTEMP[5] = xmm13; + QTEMP[6] = xmm14; + QTEMP[7] = xmm15; + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) ); + xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) ); + xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) ); + xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) ); + xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) ); + xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) ); + xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) ); + xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) ); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P_2WAY(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) ); + xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) ); + xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) ); + xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) ); + xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) ); + xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) ); + xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) ); + xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) ); + + /* store P(CV+M)+CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + /* load message M (Q input) into xmm8-15 */ + xmm8 = QTEMP[0]; + xmm9 = QTEMP[1]; + xmm10 = QTEMP[2]; + xmm11 = QTEMP[3]; + xmm12 = QTEMP[4]; + xmm13 = QTEMP[5]; + xmm14 = QTEMP[6]; + xmm15 = QTEMP[7]; + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q_2WAY(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) ); + xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) ); + xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) ); + xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) ); + xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) ); + xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) ); + xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) ); + xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) ); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + return; +} + +void OF1024_2way( __m256i* chaining ) +{ + static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m256i TEMP0; + static __m256i TEMP1; + static __m256i TEMP2; + + /* load CV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P_2WAY(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + xmm8 = 
_mm256_xor_si256( xmm8, (chaining[0]) ); + xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) ); + xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) ); + xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) ); + xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) ); + xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) ); + xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) ); + xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) ); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[4] = xmm0; + chaining[5] = xmm6; + chaining[6] = xmm13; + chaining[7] = xmm15; + + return; +} + + + #endif // VAES #endif // GROESTL512_INTR_4WAY_H__ diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index de8ca72..833b87e 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -174,24 +174,19 @@ void allium_16way_hash( void *state, const void *input ) #if defined(__VAES__) intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 ); - - groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 ); - + groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 ); dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 ); + intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 ); - - groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 ); - + groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 ); dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 ); + intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 ); - - groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 ); - + groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 ); dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 ); - intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 ); - groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 ); - + intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 ); + groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 ); dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 ); #else @@ -262,8 +257,11 @@ typedef struct { keccak256_4way_context keccak; cubehashParam cube; skein256_4way_context skein; +#if defined(__VAES__) + groestl256_2way_context groestl; +#else hashState_groestl256 groestl; - +#endif } allium_8way_ctx_holder; static __thread allium_8way_ctx_holder allium_8way_ctx; @@ -273,7 +271,11 @@ bool init_allium_8way_ctx() keccak256_4way_init( &allium_8way_ctx.keccak ); cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 ); skein256_4way_init( &allium_8way_ctx.skein ); +#if defined(__VAES__) + groestl256_2way_init( &allium_8way_ctx.groestl, 32 ); +#else init_groestl256( &allium_8way_ctx.groestl, 32 ); +#endif return true; } @@ -352,9 +354,28 @@ void allium_8way_hash( void *hash, const void *input ) skein256_4way_update( &ctx.skein, vhashB, 32 ); skein256_4way_close( &ctx.skein, vhashB ); +#if defined(__VAES__) + + uint64_t vhashC[4*2] __attribute__ ((aligned (64))); + uint64_t vhashD[4*2] __attribute__ ((aligned (64))); + + rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 ); + groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 ); + groestl256_2way_full( &ctx.groestl, vhashD, vhashD, 32 ); + dintrlv_2x128( hash0, hash1, vhashC, 256 ); + dintrlv_2x128( hash2, hash3, vhashD, 256 ); + + rintrlv_4x64_2x128( vhashC, vhashD, vhashB, 256 ); + groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 
); + groestl256_2way_full( &ctx.groestl, vhashD, vhashD, 32 ); + dintrlv_2x128( hash4, hash5, vhashC, 256 ); + dintrlv_2x128( hash6, hash7, vhashD, 256 ); + +#else + dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 ); dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 ); - + groestl256_full( &ctx.groestl, hash0, hash0, 256 ); groestl256_full( &ctx.groestl, hash1, hash1, 256 ); groestl256_full( &ctx.groestl, hash2, hash2, 256 ); @@ -363,6 +384,8 @@ void allium_8way_hash( void *hash, const void *input ) groestl256_full( &ctx.groestl, hash5, hash5, 256 ); groestl256_full( &ctx.groestl, hash6, hash6, 256 ); groestl256_full( &ctx.groestl, hash7, hash7, 256 ); + +#endif } int scanhash_allium_8way( struct work *work, uint32_t max_nonce, diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index ad62d05..c1d70e7 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -187,7 +187,8 @@ bool register_allium_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_allium; gate->hash = (void*)&allium_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT + | VAES_OPT | VAES256_OPT; opt_target_factor = 256.0; return true; }; diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index ba7f95d..15ce7db 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -3,36 +3,38 @@ bool register_sha256t_algo( algo_gate_t* gate ) { #if defined(SHA256T_8WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t_8way; gate->hash = (void*)&sha256t_8way_hash; -#elif defined(SHA256T_4WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; +#else gate->scanhash = (void*)&scanhash_sha256t_4way; gate->hash = (void*)&sha256t_4way_hash; +/* #else gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t; gate->hash = (void*)&sha256t_hash; +*/ #endif + gate->optimizations = SSE2_OPT | AVX2_OPT; return true; } bool register_sha256q_algo( algo_gate_t* gate ) { #if defined(SHA256T_8WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_sha256q_8way; gate->hash = (void*)&sha256q_8way_hash; -#elif defined(SHA256T_4WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; +#else gate->scanhash = (void*)&scanhash_sha256q_4way; gate->hash = (void*)&sha256q_4way_hash; +/* #else gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256q; gate->hash = (void*)&sha256q_hash; +*/ #endif + gate->optimizations = SSE2_OPT | AVX2_OPT; return true; } diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index 0d519aa..cb06f5a 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -4,13 +4,10 @@ #include #include "algo-gate-api.h" -// Override multi way on ryzen, SHA is better. 
-#if !defined(__SHA__) - #if defined(__AVX2__) +#if defined(__AVX2__) #define SHA256T_8WAY - #elif defined(__SSE2__) +#else #define SHA256T_4WAY - #endif #endif bool register_sha256t_algo( algo_gate_t* gate ); @@ -36,12 +33,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif +/* void sha256t_hash( void *output, const void *input ); int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_hash( void *output, const void *input ); int scanhash_sha256q( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - +*/ #endif diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index 4dbfd33..19f47d6 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -1,5 +1,7 @@ #include "sha256t-gate.h" +// Obsolete + #if !defined(SHA256T_16WAY) && !defined(SHA256T_8WAY) && !defined(SHA256T_4WAY) #include diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index ba531ce..83f3e66 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -26,7 +26,11 @@ static const uint32_t IV512[] = static void c512_2way( shavite512_2way_context *ctx, const void *msg ) { +#if defined(__VAES__) + const __m256i zero = _mm256_setzero_si256(); +#else const __m128i zero = _mm_setzero_si128(); +#endif __m256i p0, p1, p2, p3, x; __m256i k00, k01, k02, k03, k10, k11, k12, k13; __m256i *m = (__m256i*)msg; diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index abbe16a..4d12029 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -619,11 +619,20 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case GROESTL: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + groestl512_2way_full( &ctx.groestl, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + groestl512_2way_full( &ctx.groestl, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); - break; +#endif + break; case JH: if ( i == 0 ) jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); @@ -711,11 +720,20 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) } break; case SHAVITE: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + shavite512_2way_full( &ctx.shavite, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + shavite512_2way_full( &ctx.shavite, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else shavite512_full( &ctx.shavite, hash0, in0, size ); shavite512_full( &ctx.shavite, hash1, in1, size ); shavite512_full( &ctx.shavite, hash2, in2, size ); shavite512_full( &ctx.shavite, hash3, in3, size ); - break; +#endif + break; case SIMD: intrlv_2x128( vhash, in0, in1, size<<3 ); simd512_2way_full( &ctx.simd, vhash, vhash, size ); @@ -725,6 +743,14 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) dintrlv_2x128_512( hash2, hash3, vhash ); break; case ECHO: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + echo_2way_full( 
&ctx.echo, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + echo_2way_full( &ctx.echo, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else echo_full( &ctx.echo, (BitSequence *)hash0, 512, (const BitSequence *)in0, size ); echo_full( &ctx.echo, (BitSequence *)hash1, 512, @@ -733,7 +759,8 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) (const BitSequence *)in2, size ); echo_full( &ctx.echo, (BitSequence *)hash3, 512, (const BitSequence *)in3, size ); - break; +#endif + break; case HAMSI: if ( i == 0 ) hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 462e264..09315f6 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -61,7 +61,8 @@ bool register_x16r_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -79,7 +80,8 @@ bool register_x16rv2_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rv2; gate->hash = (void*)&x16rv2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -97,7 +99,8 @@ bool register_x16s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; @@ -230,7 +233,8 @@ bool register_x16rt_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; opt_target_factor = 256.0; return true; }; @@ -247,7 +251,8 @@ bool register_x16rt_veil_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; gate->build_extraheader = (void*)&veil_build_extraheader; opt_target_factor = 256.0; return true; @@ -277,22 +282,17 @@ bool register_x21s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x21s_8way; gate->hash = (void*)&x21s_8way_hash; gate->miner_thread_init = (void*)&x21s_8way_thread_init; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT - | VAES_OPT; #elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x21s_4way; gate->hash = (void*)&x21s_4way_hash; gate->miner_thread_init = (void*)&x21s_4way_thread_init; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT - | AVX512_OPT | VAES_OPT; #else gate->scanhash = (void*)&scanhash_x21s; gate->hash = (void*)&x21s_hash; gate->miner_thread_init = (void*)&x21s_thread_init; - gate->optimizations = SSE2_OPT | 
AES_OPT | AVX2_OPT | SHA_OPT - | AVX512_OPT | VAES_OPT; #endif -// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index cbd3899..ed93599 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -41,6 +41,7 @@ #include "algo/sha/sha-hash-4way.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-2way.h" #include "algo/shavite/shavite-hash-4way.h" #include "algo/echo/echo-hash-4way.h" #endif @@ -145,15 +146,21 @@ union _x16r_4way_context_overlay { blake512_4way_context blake; bmw512_4way_context bmw; - hashState_echo echo; +#if defined(__VAES__) + groestl512_2way_context groestl; + shavite512_2way_context shavite; + echo_2way_context echo; +#else hashState_groestl groestl; + shavite512_context shavite; + hashState_echo echo; +#endif skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; hashState_luffa luffa1; cubehashParam cube; - shavite512_context shavite; simd_2way_context simd; hamsi512_4way_context hamsi; hashState_fugue fugue; diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index e2d80da..4421636 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -672,14 +672,20 @@ union _x16rv2_4way_context_overlay { blake512_4way_context blake; bmw512_4way_context bmw; - hashState_echo echo; +#if defined(__VAES__) + groestl512_2way_context groestl; + shavite512_2way_context shavite; + echo_2way_context echo; +#else hashState_groestl groestl; + shavite512_context shavite; + hashState_echo echo; +#endif skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; cubehashParam cube; - shavite512_context shavite; simd_2way_context simd; hamsi512_4way_context hamsi; hashState_fugue fugue; @@ -745,10 +751,19 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case GROESTL: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + groestl512_2way_full( &ctx.groestl, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + groestl512_2way_full( &ctx.groestl, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); +#endif break; case JH: if ( i == 0 ) @@ -887,10 +902,19 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) } break; case SHAVITE: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + shavite512_2way_full( &ctx.shavite, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + shavite512_2way_full( &ctx.shavite, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else shavite512_full( &ctx.shavite, hash0, in0, size ); shavite512_full( &ctx.shavite, hash1, in1, size ); shavite512_full( &ctx.shavite, hash2, in2, size ); shavite512_full( &ctx.shavite, hash3, in3, size ); +#endif break; case SIMD: intrlv_2x128( 
vhash, in0, in1, size<<3 ); @@ -901,6 +925,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) dintrlv_2x128_512( hash2, hash3, vhash ); break; case ECHO: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + echo_2way_full( &ctx.echo, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + echo_2way_full( &ctx.echo, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else echo_full( &ctx.echo, (BitSequence *)hash0, 512, (const BitSequence *)in0, size ); echo_full( &ctx.echo, (BitSequence *)hash1, 512, @@ -909,6 +941,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) (const BitSequence *)in2, size ); echo_full( &ctx.echo, (BitSequence *)hash3, 512, (const BitSequence *)in3, size ); +#endif break; case HAMSI: if ( i == 0 ) diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index e4fe98b..92deeb8 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -1124,7 +1124,13 @@ union _sonoa_4way_context_overlay { blake512_4way_context blake; bmw512_4way_context bmw; +#if defined(__VAES__) + groestl512_2way_context groestl; + echo512_2way_context echo; +#else hashState_groestl groestl; + hashState_echo echo; +#endif skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; @@ -1132,7 +1138,6 @@ union _sonoa_4way_context_overlay cube_2way_context cube; shavite512_2way_context shavite; simd_2way_context simd; - hashState_echo echo; hamsi512_4way_context hamsi; hashState_fugue fugue; shabal512_4way_context shabal; @@ -1162,6 +1167,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1171,6 +1187,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1195,6 +1213,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1206,16 +1233,29 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) (const BitSequence *)hash2, 64 ); echo_full( &ctx.echo, (BitSequence *)hash3, 512, (const BitSequence *)hash3, 64 ); + + intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); + +#endif if ( work_restart[thr_id].restart ) return 0; // 2 - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, 
vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1225,6 +1265,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1249,6 +1291,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1263,6 +1314,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -1274,6 +1327,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1283,6 +1347,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1307,6 +1373,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1321,6 +1396,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -1340,6 +1417,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1349,6 +1437,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1373,6 +1463,15 @@ int 
sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1387,6 +1486,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -1410,6 +1511,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) hamsi512_4way_update( &ctx.hamsi, vhashB, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); echo_full( &ctx.echo, (BitSequence *)hash0, 512, @@ -1424,6 +1534,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_2x128_512( vhashA, hash0, hash1 ); intrlv_2x128_512( vhashB, hash2, hash3 ); +#endif + shavite512_2way_init( &ctx.shavite ); shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); shavite512_2way_init( &ctx.shavite ); @@ -1443,6 +1555,20 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) shabal512_4way_update( &ctx.shabal, vhashB, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); +#if defined(__VAES__) + +// rintrlv_4x32_2x128( vhashA, vhashB, vhash, 512 ); + dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_2x128_512( vhashA, hash0, hash1 ); + intrlv_2x128_512( vhashB, hash2, hash3 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1452,6 +1578,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1476,6 +1604,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1490,6 +1627,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -1523,6 +1662,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + 
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1532,6 +1682,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1556,6 +1708,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1570,6 +1731,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -1616,6 +1779,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1625,6 +1799,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1649,6 +1825,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1663,6 +1848,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); diff --git a/algo/x17/sonoa-gate.c b/algo/x17/sonoa-gate.c index 926beb4..d192b0d 100644 --- a/algo/x17/sonoa-gate.c +++ b/algo/x17/sonoa-gate.c @@ -12,7 +12,7 @@ bool register_sonoa_algo( algo_gate_t* gate ) init_sonoa_ctx(); gate->hash = (void*)&sonoa_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; return true; }; diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index cce3894..fcff0b6 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -240,7 +240,13 @@ union _x17_4way_context_overlay { blake512_4way_context blake; bmw512_4way_context bmw; +#if defined(__VAES__) + groestl512_2way_context groestl; + 
echo512_2way_context echo; +#else hashState_groestl groestl; + hashState_echo echo; +#endif skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; @@ -248,7 +254,6 @@ union _x17_4way_context_overlay cube_2way_context cube; shavite512_2way_context shavite; simd_2way_context simd; - hashState_echo echo; hamsi512_4way_context hamsi; hashState_fugue fugue; shabal512_4way_context shabal; @@ -275,6 +280,17 @@ int x17_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -284,6 +300,8 @@ int x17_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -308,6 +326,15 @@ int x17_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -322,6 +349,8 @@ int x17_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index eee3d60..6ab09ff 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate ) #else gate->hash = (void*)&x17_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; return true; }; diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index beb9df6..e7cd2bf 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -405,15 +405,20 @@ union _xevan_4way_context_overlay { blake512_4way_context blake; bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; +#if defined(__VAES__) + groestl512_2way_context groestl; + echo_2way_context echo; +#else + hashState_groestl groestl; + hashState_echo echo; +#endif + skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; cube_2way_context cube; shavite512_2way_context shavite; simd_2way_context simd; - hashState_echo echo; hamsi512_4way_context hamsi; hashState_fugue fugue; shabal512_4way_context shabal; @@ -442,7 +447,17 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, dataLen ); bmw512_4way_close( &ctx.bmw, vhash ); - // Serial +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, dataLen ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, dataLen ); + + 
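// (the rintrlv_* helpers convert directly between the 2-way 128-bit and
+   // 4-way 64-bit interleaved layouts, avoiding a full de-interleave to
+   // per-lane buffers and back)
+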
rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 ); @@ -450,9 +465,10 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 ); - // Parallel 4way intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, dataLen ); jh512_4way_init( &ctx.jh ); @@ -477,6 +493,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen ); simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, dataLen ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, dataLen ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 ); @@ -489,9 +514,10 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) echo_full( &ctx.echo, (BitSequence *)hash3, 512, (const BitSequence *)hash3, dataLen ); - // Parallel intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -542,6 +568,17 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, dataLen ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, dataLen ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, dataLen ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 ); @@ -551,6 +588,8 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, dataLen ); jh512_4way_init( &ctx.jh ); @@ -575,6 +614,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen ); simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, dataLen ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, dataLen ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 ); @@ -589,6 +637,8 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c index 184ed2d..545a0aa 100644 --- a/algo/x17/xevan-gate.c +++ b/algo/x17/xevan-gate.c @@ -12,7 +12,7 @@ bool register_xevan_algo( algo_gate_t* gate ) init_xevan_ctx(); gate->hash = (void*)&xevan_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = 
diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c
index 184ed2d..545a0aa 100644
--- a/algo/x17/xevan-gate.c
+++ b/algo/x17/xevan-gate.c
@@ -12,7 +12,7 @@ bool register_xevan_algo( algo_gate_t* gate )
   init_xevan_ctx();
   gate->hash = (void*)&xevan_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
   opt_target_factor = 256.0;
   return true;
 };
diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c
index e61d1ad..ba5714b 100644
--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -11,7 +11,7 @@
 #include "algo/shavite/shavite-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
-#include "algo/shavite/sph_shavite.h"
+#include "algo/shavite/shavite-hash-2way.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/fugue-aesni.h"
 #include "algo/shabal/shabal-hash-4way.h"
@@ -494,14 +494,19 @@ union _x22i_4way_ctx_overlay
 {
    blake512_4way_context blake;
    bmw512_4way_context bmw;
+#if defined(__VAES__)
+   groestl512_2way_context groestl;
+   echo_2way_context echo;
+#else
    hashState_groestl groestl;
    hashState_echo echo;
+#endif
+   shavite512_2way_context shavite;
    skein512_4way_context skein;
    jh512_4way_context jh;
    keccak512_4way_context keccak;
    luffa_2way_context luffa;
    cube_2way_context cube;
-   shavite512_2way_context shavite;
    simd_2way_context simd;
    hamsi512_4way_context hamsi;
    hashState_fugue fugue;
@@ -535,14 +540,28 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
    bmw512_4way_init( &ctx.bmw );
    bmw512_4way_update( &ctx.bmw, vhash, 64 );
    bmw512_4way_close( &ctx.bmw, vhash );
-   dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
-   groestl512_full( &ctx.groestl, (char*)hash0, (const char*)hash0, 512 );
-   groestl512_full( &ctx.groestl, (char*)hash1, (const char*)hash1, 512 );
-   groestl512_full( &ctx.groestl, (char*)hash2, (const char*)hash2, 512 );
-   groestl512_full( &ctx.groestl, (char*)hash3, (const char*)hash3, 512 );
+#if defined(__VAES__)
-   intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
+   rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
+
+   groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
+   groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
+
+   rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
+
+#else
+
+   dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
+
+   groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+
+   intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
+
+#endif

    skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
@@ -570,6 +589,15 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
    simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
    simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );

+#if defined(__VAES__)
+
+   echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
+   echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
+
+   rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
+
+#else
+
    dintrlv_2x128_512( hash0, hash1, vhashA );
    dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -584,6 +612,8 @@ int x22i_4way_hash( void *output, const void *input, int thrid )

    intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

+#endif
+
    if ( work_restart[thrid].restart ) return false;

    hamsi512_4way_init( &ctx.hamsi );
diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c
index 78f23b4..243f69e 100644
--- a/algo/x22/x22i-gate.c
+++ b/algo/x22/x22i-gate.c
@@ -20,7 +20,7 @@ bool register_x22i_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_x22i;
   gate->hash = (void*)&x22i_hash;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
-                      | AVX512_OPT | VAES_OPT;
+                      | AVX512_OPT | VAES_OPT | VAES256_OPT;
 #endif
   return true;
 };
@@ -30,20 +30,15 @@ bool register_x25x_algo( algo_gate_t* gate )
 #if defined (X25X_8WAY)
   gate->scanhash = (void*)&scanhash_x25x_8way;
   gate->hash = (void*)&x25x_8way_hash;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT
-                      | AVX512_OPT | VAES_OPT;
 #elif defined (X25X_4WAY)
   gate->scanhash = (void*)&scanhash_x25x_4way;
   gate->hash = (void*)&x25x_4way_hash;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
-                      | AVX512_OPT | VAES_OPT;
 #else
   gate->scanhash = (void*)&scanhash_x25x;
   gate->hash = (void*)&x25x_hash;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
-                      | AVX512_OPT | VAES_OPT;
 #endif
-// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
+                        VAES_OPT | VAES256_OPT;
   return true;
 };
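The gate registrations above all change the same way: gate->optimizations is
a plain bitmask with one bit per optimization, and VAES256_OPT is simply a
new bit that check_cpu_capability() (see the cpu-miner.c hunk further down)
tests with set_incl(). A minimal sketch of the set arithmetic, assuming
set_incl is the obvious subset test:

   #include <stdbool.h>
   #include <stdint.h>

   typedef uint32_t set_t;   // one bit per feature, e.g. VAES256_OPT

   // True when every flag set in a is also set in b.
   static inline bool set_incl( set_t a, set_t b ) { return ( a & b ) == a; }

So an algo that ORs VAES256_OPT into its gate opts in to the 256-bit VAES
path even on CPUs with no AVX512 at all, which is exactly the Zen3 case
these hunks target.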
diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c
index 1cdea11..e44a82e 100644
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -15,6 +15,7 @@
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
+#include "algo/shavite/shavite-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/nist.h"
 #include "algo/simd/simd-hash-2way.h"
@@ -412,9 +413,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
    LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
    dintrlv_2x256( hash6[19], hash7[19], vhash, 256 );
-   sph_gost512_init(&ctx.gost);
-   sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
-   sph_gost512_close(&ctx.gost, (void*) hash0[20]);
+   sph_gost512_init(&ctx.gost);
+   sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash0[20]);
    sph_gost512_init(&ctx.gost);
    sph_gost512 (&ctx.gost, (const void*) hash1[19], 64);
    sph_gost512_close(&ctx.gost, (void*) hash1[20]);
@@ -574,68 +575,26 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
    return 0;
 }

-/*
-int scanhash_x25x_8way( struct work* work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<3]);
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-   uint32_t n = first_nonce;
-   const uint32_t last_nonce = max_nonce - 4;
-   const int thr_id = mythr->id;
-   const uint32_t Htarg = ptarget[7];
-
-   if (opt_benchmark)
-      ((uint32_t*)ptarget)[7] = 0x08ff;
-
-   InitializeSWIFFTX();
-
-   mm512_bswap32_intrlv80_8x64( vdata, pdata );
-   do
-   {
-      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-      x25x_8way_hash( hash, vdata );
-
-      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
-      {
-         extr_lane_8x32( lane_hash, hash, lane, 256 );
-         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-         {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
-         }
-      }
-      n += 8;
-   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
-
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-*/

 #elif defined(X25X_4WAY)

 union _x25x_4way_ctx_overlay
 {
    blake512_4way_context blake;
    bmw512_4way_context bmw;
+#if defined(__VAES__)
+   groestl512_2way_context groestl;
+   echo_2way_context echo;
+#else
    hashState_groestl groestl;
    hashState_echo echo;
+#endif
    skein512_4way_context skein;
    jh512_4way_context jh;
    keccak512_4way_context keccak;
-   hashState_luffa luffa;
-   cubehashParam cube;
-   sph_shavite512_context shavite;
-   hashState_sd simd;
+   luffa_2way_context luffa;
+   cube_2way_context cube;
+   shavite512_2way_context shavite;
+   simd_2way_context simd;
    hamsi512_4way_context hamsi;
    hashState_fugue fugue;
    shabal512_4way_context shabal;
@@ -658,6 +617,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
    unsigned char hash2[25][64] __attribute__((aligned(64))) = {0};
    unsigned char hash3[25][64] __attribute__((aligned(64))) = {0};
    unsigned char vhashX[24][64*4] __attribute__ ((aligned (64)));
+   uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
+   uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
    x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));

    blake512_4way_full( &ctx.blake, vhash, input, 80 );
@@ -668,11 +629,25 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
    bmw512_4way_close( &ctx.bmw, vhash );
    dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );

+#if defined(__VAES__)
+
+   rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
+
+   groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
+   groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
+
+   dintrlv_2x128_512( hash0[2], hash1[2], vhashA );
+   dintrlv_2x128_512( hash2[2], hash3[2], vhashB );
+
+#else
+
    groestl512_full( &ctx.groestl, (char*)hash0[2], (const char*)hash0[1], 512 );
    groestl512_full( &ctx.groestl, (char*)hash1[2], (const char*)hash1[1], 512 );
    groestl512_full( &ctx.groestl, (char*)hash2[2], (const char*)hash2[1], 512 );
    groestl512_full( &ctx.groestl, (char*)hash3[2], (const char*)hash3[1], 512 );

+#endif
+
    intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );
    skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
    dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );
@@ -689,41 +664,38 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
    keccak512_4way_close( &ctx.keccak, vhash );
    dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );

-   luffa_full( &ctx.luffa, (BitSequence*)hash0[6], 512,
-               (const BitSequence*)hash0[5], 64 );
-   luffa_full( &ctx.luffa, (BitSequence*)hash1[6], 512,
-               (const BitSequence*)hash1[5], 64 );
-   luffa_full( &ctx.luffa, (BitSequence*)hash2[6], 512,
-               (const BitSequence*)hash2[5], 64 );
-   luffa_full( &ctx.luffa, (BitSequence*)hash3[6], 512,
-               (const BitSequence*)hash3[5], 64 );
+   rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

-   cubehash_full( &ctx.cube, (byte*)hash0[7], 512, (const byte*)hash0[6], 64 );
-   cubehash_full( &ctx.cube, (byte*)hash1[7], 512, (const byte*)hash1[6], 64 );
-   cubehash_full( &ctx.cube, (byte*)hash2[7], 512, (const byte*)hash2[6], 64 );
-   cubehash_full( &ctx.cube, (byte*)hash3[7], 512, (const byte*)hash3[6], 64 );
+   luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
+   luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );
+   dintrlv_2x128_512( hash0[6], hash1[6], vhashA );
+   dintrlv_2x128_512( hash2[6], hash3[6], vhashB );

-   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64);
-   sph_shavite512_close(&ctx.shavite, hash0[8]);
-   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64);
-   sph_shavite512_close(&ctx.shavite, hash1[8]);
-   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) hash2[7], 64);
-   sph_shavite512_close(&ctx.shavite, hash2[8]);
-   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64);
-   sph_shavite512_close(&ctx.shavite, hash3[8]);
+   cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
+   cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
+   dintrlv_2x128_512( hash0[7], hash1[7], vhashA );
+   dintrlv_2x128_512( hash2[7], hash3[7], vhashB );

-   simd_full( &ctx.simd, (BitSequence*)hash0[9],
-              (const BitSequence*)hash0[8], 512 );
-   simd_full( &ctx.simd, (BitSequence*)hash1[9],
-              (const BitSequence*)hash1[8], 512 );
-   simd_full( &ctx.simd, (BitSequence*)hash2[9],
-              (const BitSequence*)hash2[8], 512 );
-   simd_full( &ctx.simd, (BitSequence*)hash3[9],
-              (const BitSequence*)hash3[8], 512 );
+   shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
+   shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );
+   dintrlv_2x128_512( hash0[8], hash1[8], vhashA );
+   dintrlv_2x128_512( hash2[8], hash3[8], vhashB );
+
+   simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
+   simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
+   dintrlv_2x128_512( hash0[9], hash1[9], vhashA );
+   dintrlv_2x128_512( hash2[9], hash3[9], vhashB );
+
+#if defined(__VAES__)
+
+   echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
+   echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
+   dintrlv_2x128_512( hash0[10], hash1[10], vhashA );
+   dintrlv_2x128_512( hash2[10], hash3[10], vhashB );
+
+   rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
+
+#else

    echo_full( &ctx.echo, (BitSequence *)hash0[10], 512,
               (const BitSequence *)hash0[ 9], 64 );
@@ -736,6 +708,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid )

    intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] );

+#endif
+
    if ( work_restart[thrid].restart ) return 0;

    hamsi512_4way_init( &ctx.hamsi );
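All four files changed above follow one template: a union overlays the
per-stage contexts so they share a single aligned allocation, and the
preprocessor selects between the 2-way VAES context types and the serial
AES-NI ones. Distilled to a toy (the two context structs are stand-ins, not
the real types):

   #include <stdint.h>

   typedef struct { uint64_t s[32]; } two_way_ctx;   // stand-in: 2 lanes wide
   typedef struct { uint64_t s[16]; } serial_ctx;    // stand-in: 1 lane

   typedef union
   {
   #if defined(__VAES__)
      two_way_ctx groestl;
      two_way_ctx echo;
   #else
      serial_ctx  groestl;
      serial_ctx  echo;
   #endif
   } ctx_overlay;

   // Aliasing the members is safe because every stage (re)initialises its
   // context before use; peak memory is the largest member, not the sum.

Note that only the Groestl and Echo members are conditional: the other 128-bit
stages (luffa, cube, shavite, simd) use the same 2-way types either way.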
diff --git a/build-allarch.sh b/build-allarch.sh
index 50a5865..eb1f71b 100755
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -4,8 +4,9 @@
 # during development. However the information contained may provide compilation
 # tips to users.

-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
+rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null

+# Icelake AVX512 SHA VAES
 make distclean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
@@ -16,6 +17,20 @@ mv cpuminer.exe cpuminer-avx512-sha-vaes.exe
 strip -s cpuminer
 mv cpuminer cpuminer-avx512-sha-vaes

+# Rocketlake AVX512 AES SHA
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=skylake-avx512 -msha -Wall -fno-common" ./configure --with-curl
+# CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl
+make -j 8
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-avx512-sha.exe
+strip -s cpuminer
+mv cpuminer cpuminer-avx512-sha
+
+# Skylake-X AVX512 AES
+make clean || echo clean
+rm -f config.status
 CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-common" ./configure --with-curl
 make -j 8
 strip -s cpuminer.exe
@@ -23,6 +38,7 @@ mv cpuminer.exe cpuminer-avx512.exe
 strip -s cpuminer
 mv cpuminer cpuminer-avx512

+# Haswell AVX2 AES
 make clean || echo clean
 rm -f config.status
 # GCC 9 doesn't include AES with core-avx2
@@ -33,6 +49,7 @@ mv cpuminer.exe cpuminer-avx2.exe
 strip -s cpuminer
 mv cpuminer cpuminer-avx2

+# Sandybridge AVX AES
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=corei7-avx -maes -Wall -fno-common" ./configure --with-curl
@@ -42,15 +59,17 @@ mv cpuminer.exe cpuminer-avx.exe
 strip -s cpuminer
 mv cpuminer cpuminer-avx

+# Westmere SSE4.2 AES
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -maes -msse4.2 -Wall -fno-common" ./configure --with-curl
+CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-aes-sse42.exe
 strip -s cpuminer
 mv cpuminer cpuminer-aes-sse42

+# Nehalem SSE4.2
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=corei7 -Wall -fno-common" ./configure --with-curl
@@ -60,6 +79,7 @@ mv cpuminer.exe cpuminer-sse42.exe
 strip -s cpuminer
 mv cpuminer cpuminer-sse42

+# Core2 SSSE3
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=core2 -Wall -fno-common" ./configure --with-curl
@@ -69,6 +89,7 @@ mv cpuminer.exe cpuminer-ssse3.exe
 strip -s cpuminer
 mv cpuminer cpuminer-ssse3

+# Generic SSE2
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -msse2 -Wall -fno-common" ./configure --with-curl
@@ -78,6 +99,7 @@ mv cpuminer.exe cpuminer-sse2.exe
 strip -s cpuminer
 mv cpuminer cpuminer-sse2

+# Zen1 AVX2 SHA
 make clean || echo done
 rm -f config.status
 CFLAGS="-O3 -march=znver1 -Wall -fno-common" ./configure --with-curl
@@ -87,6 +109,7 @@ mv cpuminer.exe cpuminer-zen.exe
 strip -s cpuminer
 mv cpuminer cpuminer-zen

+# Zen3 AVX2 SHA VAES
 make clean || echo done
 rm -f config.status
 CFLAGS="-O3 -march=znver2 -mvaes -Wall -fno-common" ./configure --with-curl
@@ -97,6 +120,7 @@ mv cpuminer.exe cpuminer-zen3.exe
 strip -s cpuminer
 mv cpuminer cpuminer-zen3

+# Native to current CPU
 make clean || echo done
 rm -f config.status
 CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl
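When adding a build variant like the ones above, it is worth confirming which
ISA macros a given CFLAGS combination actually defines, since that is what
the #if defined(__VAES__) dispatch in the source keys on (the core-avx2
comment above exists because GCC 9 omits AES from that arch). One quick check
is gcc -march=znver2 -mvaes -dM -E - </dev/null | grep -i vaes; a tiny probe
compiled with the flags under test works too:

   // probe.c -- build with the CFLAGS being tested, then run it, e.g.
   //    gcc -O3 -march=znver2 -mvaes probe.c -o probe && ./probe
   #include <stdio.h>

   int main(void)
   {
   #if defined(__AES__)
      puts( "AES" );
   #endif
   #if defined(__AVX2__)
      puts( "AVX2" );
   #endif
   #if defined(__SHA__)
      puts( "SHA" );
   #endif
   #if defined(__AVX512F__)
      puts( "AVX512F" );
   #endif
   #if defined(__VAES__)
      puts( "VAES" );
   #endif
      return 0;
   }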
diff --git a/build-avx2.sh b/build-avx2.sh
deleted file mode 100755
index 7a12473..0000000
--- a/build-avx2.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-#if [ "$OS" = "Windows_NT" ]; then
-# ./mingw64.sh
-# exit 0
-#fi
-
-# Linux build
-
-make distclean || echo clean
-
-rm -f config.status
-./autogen.sh || echo done
-
-# Ubuntu 10.04 (gcc 4.4)
-# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
-
-# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
-#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
-
-#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
-CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
-#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
-
-make -j 4
-
-strip -s cpuminer
diff --git a/build-no-common.sh b/build-no-common.sh
deleted file mode 100755
index 60b6da5..0000000
--- a/build-no-common.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-#if [ "$OS" = "Windows_NT" ]; then
-# ./mingw64.sh
-# exit 0
-#fi
-
-# Linux build
-
-make distclean || echo clean
-
-rm -f config.status
-./autogen.sh || echo done
-
-# Ubuntu 10.04 (gcc 4.4)
-# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
-
-# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
-#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
-
-#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
-CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl
-#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
-
-make -j 4
-
-strip -s cpuminer
diff --git a/build.sh b/build.sh
index bf713ea..39bf5f6 100755
--- a/build.sh
+++ b/build.sh
@@ -12,15 +12,8 @@ make distclean || echo clean
 rm -f config.status
 ./autogen.sh || echo done

-# Ubuntu 10.04 (gcc 4.4)
-# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
-
-# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
-#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
-
 #CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
 CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
-#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl

 make -j 4
diff --git a/buildjdd.sh b/buildjdd.sh
deleted file mode 100755
index df17f22..0000000
--- a/buildjdd.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-#if [ "$OS" = "Windows_NT" ]; then
-# ./mingw64.sh
-# exit 0
-#fi
-
-# Linux build
-
-make distclean || echo clean
-
-rm -f config.status
-./autogen.sh || echo done
-
-# Ubuntu 10.04 (gcc 4.4)
-# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
-
-# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
-#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
-
-CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure --with-curl
-#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
-#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
-
-make -j 4
-
-strip -s cpuminer
diff --git a/clean-all.sh b/clean-all.sh
index 2ca980e..e91bbb5 100755
--- a/clean-all.sh
+++ b/clean-all.sh
@@ -2,8 +2,8 @@
 #
 # make clean and rm all the targeted executables.

-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null
+rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null

-rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null
+rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null

 make distclean > /dev/null
diff --git a/configure b/configure
index fcbefb8..5955782 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.1.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.2.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.15.1'
-PACKAGE_STRING='cpuminer-opt 3.15.1'
+PACKAGE_VERSION='3.15.2'
+PACKAGE_STRING='cpuminer-opt 3.15.2'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.15.1 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.15.2 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.15.1:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.15.2:";;
    esac
   cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.15.1
+cpuminer-opt configure 3.15.2
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.15.1, which was
+It was created by cpuminer-opt $as_me 3.15.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

   $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
  PACKAGE='cpuminer-opt'
- VERSION='3.15.1'
+ VERSION='3.15.2'

 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.15.1, which was
+This file was extended by cpuminer-opt $as_me 3.15.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

   CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.15.1
+cpuminer-opt config.status 3.15.2
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index c0b3c5f..b5c82d2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.15.1])
+AC_INIT([cpuminer-opt], [3.15.2])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index ebfdb7d..e459e62 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -3383,13 +3383,14 @@ bool check_cpu_capability ()
      bool sw_has_sha = false;
      bool sw_has_vaes = false;
      set_t algo_features = algo_gate.optimizations;
-     bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
-     bool algo_has_aes = set_incl( AES_OPT, algo_features );
-     bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
-     bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
-     bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
-     bool algo_has_sha = set_incl( SHA_OPT, algo_features );
-     bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
+     bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
+     bool algo_has_aes = set_incl( AES_OPT, algo_features );
+     bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
+     bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
+     bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
+     bool algo_has_sha = set_incl( SHA_OPT, algo_features );
+     bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
+     bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
      bool use_aes;
      bool use_sse2;
      bool use_sse42;
@@ -3510,7 +3511,8 @@ bool check_cpu_capability ()
      use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
      use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
      use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
-     use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes && use_avx512;
+     use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes
+             && ( use_avx512 || algo_has_vaes256 );
      use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 ||
                    use_avx2 || use_sha || use_vaes );
diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h
index 155293a..5f94cbc 100644
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -143,8 +143,8 @@ do { \
 // Parallel AES, for when x is expected to be in a 256 bit register.
 // Use same 128 bit key.

-//#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-#if 0
+#if defined(__VAES__)
+
 #define mm256_aesenc_2x128( x, k ) \
    _mm256_aesenc_epi128( x, k )
diff --git a/sysinfos.c b/sysinfos.c
index 1d5cdf3..010c78f 100644
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -483,11 +483,13 @@ static inline bool has_avx512()
 // AMD Zen3 added support for 256 bit VAES without requiring AVX512.
 // The original Intel spec requires AVX512F to support 512 bit VAES and
 // requires AVX512VL to support 256 bit VAES.
-// cpuminer-opt only uses VAES512, simply testing the VAES bit is sufficient.
-// However, proper detection of VAES512 and VAES256 requires more work:
-// VAES512 = VAES && AVX512F   (may not support VAES256)
-// VAES256 = AVX512VL ? VAES : ( AVX && VAES )   (may not support VAES512)
-// VAES = VAES && AVX512F && AVX512VL   (supports both)
+// The CPUID VAES bit alone can't distinguish 256 vs 512 bit.
+// If necessary:
+// VAES 256 & 512 = VAES && AVX512VL
+// VAES 512 = VAES && AVX512F
+// VAES 256 = ( VAES && AVX512VL ) || ( VAES && !AVX512F )
+// VAES 512 only = VAES && AVX512F && !AVX512VL
+// VAES 256 only = VAES && !AVX512F

 static inline bool has_vaes()
 {
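The detection rules in the sysinfos.c comment above map directly onto CPUID
leaf 7. A sketch using GCC's <cpuid.h> helper, assuming the standard bit
positions (EBX bit 16 = AVX512F, EBX bit 31 = AVX512VL, ECX bit 9 = VAES):

   #include <cpuid.h>
   #include <stdbool.h>

   static void vaes_flavors( bool *vaes256, bool *vaes512 )
   {
      unsigned int eax, ebx, ecx, edx;
      *vaes256 = *vaes512 = false;
      if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) ) return;

      const bool vaes     = ecx & ( 1u << 9 );
      const bool avx512f  = ebx & ( 1u << 16 );
      const bool avx512vl = ebx & ( 1u << 31 );

      *vaes512 = vaes && avx512f;                   // VAES 512
      *vaes256 = vaes && ( avx512vl || !avx512f );  // VAES 256, incl. Zen3
   }

This mirrors the relaxed use_vaes test in the cpu-miner.c hunk: once an algo
registers VAES256_OPT, the VAES path no longer requires use_avx512.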
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index f19bbb6..5e3542b 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -40,6 +40,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/

 # Start building...

+# Icelake AVX512 SHA VAES
 ./clean-all.sh || echo clean
 rm -f config.status
 ./autogen.sh || echo done
@@ -48,6 +49,7 @@ make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe

+# Zen1 AVX2 SHA
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
@@ -55,6 +57,16 @@ make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-zen.exe

+# Zen3 AVX2 SHA VAES
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=znver2 -mvaes -Wall" ./configure $CONFIGURE_ARGS
+# CFLAGS="-O3 -march=znver3 -Wall" ./configure $CONFIGURE_ARGS
+make -j 8
+strip -s cpuminer.exe
+mv cpuminer.exe release/cpuminer-zen3.exe
+
+# Skylake-X AVX512 AES
 # mingw won't compile avx512 without -fno-asynchronous-unwind-tables
 make clean || echo clean
 rm -f config.status
@@ -64,6 +76,7 @@ make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512.exe

+# Haswell AVX2 AES
 make clean || echo clean
 rm -f config.status
 # GCC 9 doesn't include AES in -march=core-avx2
@@ -72,6 +85,7 @@ make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2.exe

+# Sandybridge AVX AES
 make clean || echo clean
 rm -f config.status
 # -march=corei7-avx still includes aes, but just in case
@@ -80,6 +94,7 @@ make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx.exe

+# Westmere SSE4.2 AES
 # -march=westmere is supported in gcc5
 make clean || echo clean
 rm -f config.status
@@ -104,6 +119,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
 #mv cpuminer.exe release/cpuminer-ssse3.exe
 #make clean || echo clean

+# Generic SSE2
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS
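For completeness, the primitive everything above funnels into is the one
wrapped by mm256_aesenc_2x128 in the simd-256.h hunk: a single VAES
instruction performing one AES round on each 128-bit half of a 256-bit
register. In a -mvaes build:

   #include <immintrin.h>

   // One AES encryption round per 128-bit lane, two lanes per instruction;
   // this is what lets a Zen3 or Icelake build hash two lanes at once.
   static inline __m256i aes_round_2x128( __m256i state, __m256i round_key )
   {
      return _mm256_aesenc_epi128( state, round_key );
   }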