diff --git a/INSTALL_LINUX b/INSTALL_LINUX index 264b828..e2a0953 100644 --- a/INSTALL_LINUX +++ b/INSTALL_LINUX @@ -24,18 +24,10 @@ be installed manually. There may be others, read the error messages they will give a clue as to the missing package. The following command should install everything you need on Debian based -distributions such as Ubuntu: +distributions such as Ubuntu. Fedora and other distributions may have similar +but different package names. -sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev - -build-essential (Development Tools package group on Fedora) -automake -libjansson-dev -libgmp-dev -libcurl4-openssl-dev -libssl-dev -lib-thread -zlib1g-dev +sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and openssl 1.1.0e or higher. Add one of the following, depending on the diff --git a/INSTALL_WINDOWS b/INSTALL_WINDOWS index 75d6d70..f2e2c80 100644 --- a/INSTALL_WINDOWS +++ b/INSTALL_WINDOWS @@ -22,14 +22,13 @@ Step by step... Refer to Linux compile instructions and install required packages. -Additionally, install mingw-64. +Additionally, install mingw-w64. sudo apt-get install mingw-w64 2. Create a local library directory for packages to be compiled in the next - step. Recommended location is $HOME/usr/lib/ - + step. Suggested location is $HOME/usr/lib/ 3. Download and build other packages for mingw that don't have a mingw64 version available in the repositories. diff --git a/Makefile.am b/Makefile.am index e12dced..a2ba0fc 100644 --- a/Makefile.am +++ b/Makefile.am @@ -174,7 +174,6 @@ cpuminer_SOURCES = \ algo/sha/sph_sha2big.c \ algo/sha/sha256-hash-4way.c \ algo/sha/sha512-hash-4way.c \ - algo/sha/sha256_hash_11way.c \ algo/sha/sha2.c \ algo/sha/sha256t-gate.c \ algo/sha/sha256t-4way.c \ @@ -198,7 +197,6 @@ cpuminer_SOURCES = \ algo/skein/skein-gate.c \ algo/skein/skein2.c \ algo/skein/skein2-4way.c \ - algo/skein/skein2-gate.c \ algo/sm3/sm3.c \ algo/sm3/sm3-hash-4way.c \ algo/swifftx/swifftx.c \ diff --git a/README.txt b/README.txt index 0103099..0c9e3e9 100644 --- a/README.txt +++ b/README.txt @@ -29,6 +29,7 @@ cpuminer-sse2.exe "-msse2" Core2, Nehalem cpuminer-aes-sse42.exe "-march=westmere" Westmere cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake +cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper If you like this software feel free to donate: diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 8124aa6..69866e6 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -31,6 +31,26 @@ FreeBSD YMMV. Change Log ---------- +v3.10.0 + +AVX-512 is now supported on selected algos, and a Windows binary is now available. +AVX512 optimizations are available for argon2d, blake2s, keccak, keccakc, +skein & skein2. + +Fixed CPU temperature reporting for some CPU models (Linux only). + +Fixed a bug that caused some lanes not to submit shares. + +Fixed some previously undetected buffer overflows. + +Lyra2rev2 3% faster SSE2 and AVX2. + +Added "-fno-asynchronous-unwind-tables" to AVX512 build script for Windows +to fix a known mingw issue. + +Changed AVX2 build script to explicitly add AES to address a change in +behaviour in GCC 9. + v3.9.11 Added x22i & x25x algos.
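The source changes that follow implement the AVX-512 paths noted in the release notes. The width of each algo is chosen at compile time: a gate header tests the AVX-512 feature macros and falls back to AVX2 or scalar code. The sketch below shows that selection pattern in isolation; the example_* names are placeholders and only the feature-test macros and the overall shape follow the gate files changed later in this diff.

#include <stdint.h>

// Compile-time width selection, mirroring the gate headers in this diff:
// AVX-512 requires the F, VL, DQ and BW subsets, otherwise fall back to AVX2.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define EXAMPLE_8WAY 1
#elif defined(__AVX2__)
  #define EXAMPLE_4WAY 1
#endif

// Hypothetical per-width workers, stand-ins for scanhash_blake2b_8way etc.
int example_scanhash_8way( uint32_t max_nonce );
int example_scanhash_4way( uint32_t max_nonce );
int example_scanhash_ref ( uint32_t max_nonce );

typedef int (*example_scanhash_fn)( uint32_t );

// Registration binds whichever implementation this binary was compiled for.
example_scanhash_fn example_register( void )
{
#if defined(EXAMPLE_8WAY)
   return example_scanhash_8way;
#elif defined(EXAMPLE_4WAY)
   return example_scanhash_4way;
#else
   return example_scanhash_ref;
#endif
}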
diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index 03ebe9d..6c52ffb 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -59,7 +59,6 @@ extern "C"{ typedef struct { unsigned char buf[64<<2]; uint32_t H[8<<2]; - uint32_t S[4<<2]; // __m128i buf[16] __attribute__ ((aligned (64))); // __m128i H[8]; // __m128i S[4]; @@ -93,7 +92,6 @@ void blake256r8_4way_close(void *cc, void *dst); typedef struct { __m256i buf[16] __attribute__ ((aligned (64))); __m256i H[8]; - __m256i S[4]; size_t ptr; sph_u32 T0, T1; int rounds; // 14 for blake, 8 for blakecoin & vanilla diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c index 63cee58..6597b6b 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -304,16 +304,17 @@ static const sph_u32 CS[16] = { #endif +// Blake-256 4 way #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \ do { \ - a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \ - _mm_set1_epi32( c1 ), m0 ), b ), a ); \ + a = _mm_add_epi32( _mm_add_epi32( a, b ), \ + _mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \ d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ - a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \ - _mm_set1_epi32( c0 ), m1 ), b ), a ); \ + a = _mm_add_epi32( _mm_add_epi32( a, b ), \ + _mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \ d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ @@ -321,7 +322,8 @@ do { \ #if SPH_COMPACT_BLAKE_32 -// Blake-256 4 way +// Not used +#if 0 #define ROUND_S_4WAY(r) do { \ GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \ @@ -342,6 +344,8 @@ do { \ CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \ } while (0) +#endif + #else #define ROUND_S_4WAY(r) do { \ @@ -359,7 +363,6 @@ do { \ #define DECL_STATE32_4WAY \ __m128i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m128i S0, S1, S2, S3; \ uint32_t T0, T1; #define READ_STATE32_4WAY(state) do { \ @@ -371,10 +374,6 @@ do { \ H5 = casti_m128i( state->H, 5 ); \ H6 = casti_m128i( state->H, 6 ); \ H7 = casti_m128i( state->H, 7 ); \ - S0 = casti_m128i( state->S, 0 ); \ - S1 = casti_m128i( state->S, 1 ); \ - S2 = casti_m128i( state->S, 2 ); \ - S3 = casti_m128i( state->S, 3 ); \ T0 = (state)->T0; \ T1 = (state)->T1; \ } while (0) @@ -388,17 +387,13 @@ do { \ casti_m128i( state->H, 5 ) = H5; \ casti_m128i( state->H, 6 ) = H6; \ casti_m128i( state->H, 7 ) = H7; \ - casti_m128i( state->S, 0 ) = S0; \ - casti_m128i( state->S, 1 ) = S1; \ - casti_m128i( state->S, 2 ) = S2; \ - casti_m128i( state->S, 3 ) = S3; \ (state)->T0 = T0; \ (state)->T1 = T1; \ } while (0) #if SPH_COMPACT_BLAKE_32 // not used - +#if 0 #define COMPRESS32_4WAY( rounds ) do { \ __m128i M[16]; \ __m128i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -441,6 +436,7 @@ do { \ H7 = _mm_xor_si128( _mm_xor_si128( \ _mm_xor_si128( S3, V7 ), VF ), H7 ); \ } while (0) +#endif #else @@ -508,10 +504,10 @@ do { \ V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \ - V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \ - VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \ - VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \ + V8 = m128_const1_64( 0x243F6A88243F6A88 ); \ + V9 = m128_const1_64( 0x85A308D385A308D3 ); \ + VA = m128_const1_64( 0x13198A2E13198A2E ); \ + VB = m128_const1_64( 0x0370734403707344 ); \ VC = _mm_xor_si128( 
_mm_set1_epi32( T0 ), \ m128_const1_64( 0xA4093822A4093822 ) ); \ VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \ @@ -538,14 +534,14 @@ do { \ ROUND_S_4WAY(2); \ ROUND_S_4WAY(3); \ } \ - H0 = mm128_xor4( V8, V0, S0, H0 ); \ - H1 = mm128_xor4( V9, V1, S1, H1 ); \ - H2 = mm128_xor4( VA, V2, S2, H2 ); \ - H3 = mm128_xor4( VB, V3, S3, H3 ); \ - H4 = mm128_xor4( VC, V4, S0, H4 ); \ - H5 = mm128_xor4( VD, V5, S1, H5 ); \ - H6 = mm128_xor4( VE, V6, S2, H6 ); \ - H7 = mm128_xor4( VF, V7, S3, H7 ); \ + H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \ + H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \ + H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \ + H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \ + H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \ + H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \ + H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \ + H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \ } while (0) #endif @@ -556,13 +552,13 @@ do { \ #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \ do { \ - a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ - _mm256_set1_epi32( c1 ), m0 ), b ), a ); \ + a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \ + _mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \ d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \ - a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ - _mm256_set1_epi32( c0 ), m1 ), b ), a ); \ + a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \ + _mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \ d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \ @@ -581,7 +577,6 @@ do { \ #define DECL_STATE32_8WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ - __m256i S0, S1, S2, S3; \ sph_u32 T0, T1; #define READ_STATE32_8WAY(state) \ @@ -594,10 +589,6 @@ do { \ H5 = (state)->H[5]; \ H6 = (state)->H[6]; \ H7 = (state)->H[7]; \ - S0 = (state)->S[0]; \ - S1 = (state)->S[1]; \ - S2 = (state)->S[2]; \ - S3 = (state)->S[3]; \ T0 = (state)->T0; \ T1 = (state)->T1; \ } while (0) @@ -612,10 +603,6 @@ do { \ (state)->H[5] = H5; \ (state)->H[6] = H6; \ (state)->H[7] = H7; \ - (state)->S[0] = S0; \ - (state)->S[1] = S1; \ - (state)->S[2] = S2; \ - (state)->S[3] = S3; \ (state)->T0 = T0; \ (state)->T1 = T1; \ } while (0) @@ -635,10 +622,10 @@ do { \ V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \ - V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \ - VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \ - VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \ + V8 = m256_const1_64( 0x243F6A88243F6A88 ); \ + V9 = m256_const1_64( 0x85A308D385A308D3 ); \ + VA = m256_const1_64( 0x13198A2E13198A2E ); \ + VB = m256_const1_64( 0x0370734403707344 ); \ VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\ m256_const1_64( 0xA4093822A4093822 ) ); \ VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\ @@ -682,14 +669,14 @@ do { \ ROUND_S_8WAY(2); \ ROUND_S_8WAY(3); \ } \ - H0 = mm256_xor4( V8, V0, S0, H0 ); \ - H1 = mm256_xor4( V9, V1, S1, H1 ); \ - H2 = mm256_xor4( VA, V2, S2, H2 ); \ - H3 = mm256_xor4( VB, V3, S3, H3 ); \ - H4 = mm256_xor4( VC, V4, S0, H4 ); \ - H5 = mm256_xor4( VD, V5, S1, H5 ); \ - H6 = mm256_xor4( VE, V6, S2, H6 ); \ - H7 = mm256_xor4( VF, V7, S3, H7 ); \ + H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \ + H1 = _mm256_xor_si256( _mm256_xor_si256( 
V9, V1 ), H1 ); \ + H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \ + H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \ + H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \ + H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \ + H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \ + H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \ } while (0) @@ -703,7 +690,6 @@ static void blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, const uint32_t *salt, int rounds ) { - __m128i zero = m128_zero; casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 ); casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 ); casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 ); @@ -712,11 +698,6 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C ); casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB ); casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 ); - - casti_m128i( ctx->S, 0 ) = zero; - casti_m128i( ctx->S, 1 ) = zero; - casti_m128i( ctx->S, 2 ) = zero; - casti_m128i( ctx->S, 3 ) = zero; ctx->T0 = ctx->T1 = 0; ctx->ptr = 0; ctx->rounds = rounds; @@ -824,7 +805,6 @@ static void blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv, const sph_u32 *salt, int rounds ) { - __m256i zero = m256_zero; casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 ); casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 ); casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 ); @@ -833,10 +813,6 @@ blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv, casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C ); casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB ); casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 ); - casti_m256i( sc->S, 0 ) = zero; - casti_m256i( sc->S, 1 ) = zero; - casti_m256i( sc->S, 2 ) = zero; - casti_m256i( sc->S, 3 ) = zero; sc->T0 = sc->T1 = 0; sc->ptr = 0; sc->rounds = rounds; diff --git a/algo/blake/blake2b-4way.c b/algo/blake/blake2b-4way.c index 1bd3b2c..1790aa0 100644 --- a/algo/blake/blake2b-4way.c +++ b/algo/blake/blake2b-4way.c @@ -4,13 +4,59 @@ */ #include "blake2b-gate.h" - -#if defined(BLAKE2B_4WAY) - #include #include #include "blake2b-hash-4way.h" +#if defined(BLAKE2B_8WAY) + +int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128)));; + uint32_t vdata[20*8] __attribute__ ((aligned (64)));; + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + blake2b_8way_ctx ctx __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[25]); // 3*8+1 + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + int thr_id = mythr->id; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + + uint32_t n = first_nonce; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + do { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); + + blake2b_8way_init( &ctx ); + blake2b_8way_update( &ctx, vdata, 80 ); + blake2b_8way_final( &ctx, hash ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane<<1 ] < Htarg ) + { + extr_lane_8x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n 
+ lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( (n < max_nonce-8) && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + return 0; +} + +#elif defined(BLAKE2B_4WAY) + // Function not used, code inlined. void blake2b_4way_hash(void *output, const void *input) { diff --git a/algo/blake/blake2b-gate.c b/algo/blake/blake2b-gate.c index da8851c..549563a 100644 --- a/algo/blake/blake2b-gate.c +++ b/algo/blake/blake2b-gate.c @@ -1,15 +1,19 @@ #include "blake2b-gate.h" + bool register_blake2b_algo( algo_gate_t* gate ) { -#if defined(BLAKE2B_4WAY) +#if defined(BLAKE2B_8WAY) + gate->scanhash = (void*)&scanhash_blake2b_8way; +// gate->hash = (void*)&blake2b_8way_hash; +#elif defined(BLAKE2B_4WAY) gate->scanhash = (void*)&scanhash_blake2b_4way; gate->hash = (void*)&blake2b_4way_hash; #else gate->scanhash = (void*)&scanhash_blake2b; gate->hash = (void*)&blake2b_hash; #endif - gate->optimizations = AVX2_OPT; + gate->optimizations = AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/blake/blake2b-gate.h b/algo/blake/blake2b-gate.h index 4ba67f6..cc28919 100644 --- a/algo/blake/blake2b-gate.h +++ b/algo/blake/blake2b-gate.h @@ -4,13 +4,21 @@ #include #include "algo-gate-api.h" -#if defined(__AVX2__) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define BLAKE2B_8WAY +#elif defined(__AVX2__) #define BLAKE2B_4WAY #endif bool register_blake2b_algo( algo_gate_t* gate ); -#if defined(BLAKE2B_4WAY) +#if defined(BLAKE2B_8WAY) + +//void blake2b_8way_hash( void *state, const void *input ); +int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(BLAKE2B_4WAY) void blake2b_4way_hash( void *state, const void *input ); int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/blake/blake2b-hash-4way.c b/algo/blake/blake2b-hash-4way.c index 93532d5..246716f 100644 --- a/algo/blake/blake2b-hash-4way.c +++ b/algo/blake/blake2b-hash-4way.c @@ -33,6 +33,178 @@ #include "blake2b-hash-4way.h" +static const uint8_t sigma[12][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +}; + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#define B2B8W_G(a, b, c, d, x, y) \ +{ \ + v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), x ); \ + v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 32 ); \ + v[c] = _mm512_add_epi64( v[c], v[d] ); \ + v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 24 ); \ + v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), y ); \ + v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 16 ); \ + v[c] = _mm512_add_epi64( v[c], v[d] ); \ + v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \ +} + +static void blake2b_8way_compress( 
blake2b_8way_ctx *ctx, int last ) +{ + __m512i v[16], m[16]; + + v[ 0] = ctx->h[0]; + v[ 1] = ctx->h[1]; + v[ 2] = ctx->h[2]; + v[ 3] = ctx->h[3]; + v[ 4] = ctx->h[4]; + v[ 5] = ctx->h[5]; + v[ 6] = ctx->h[6]; + v[ 7] = ctx->h[7]; + v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 ); + v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B ); + v[10] = m512_const1_64( 0x3C6EF372FE94F82B ); + v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 ); + v[12] = m512_const1_64( 0x510E527FADE682D1 ); + v[13] = m512_const1_64( 0x9B05688C2B3E6C1F ); + v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B ); + v[15] = m512_const1_64( 0x5BE0CD19137E2179 ); + + v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) ); + v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) ); + + if ( last ) + v[14] = mm512_not( v[14] ); + + m[ 0] = ctx->b[ 0]; + m[ 1] = ctx->b[ 1]; + m[ 2] = ctx->b[ 2]; + m[ 3] = ctx->b[ 3]; + m[ 4] = ctx->b[ 4]; + m[ 5] = ctx->b[ 5]; + m[ 6] = ctx->b[ 6]; + m[ 7] = ctx->b[ 7]; + m[ 8] = ctx->b[ 8]; + m[ 9] = ctx->b[ 9]; + m[10] = ctx->b[10]; + m[11] = ctx->b[11]; + m[12] = ctx->b[12]; + m[13] = ctx->b[13]; + m[14] = ctx->b[14]; + m[15] = ctx->b[15]; + + for ( int i = 0; i < 12; i++ ) + { + B2B8W_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] ); + B2B8W_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] ); + B2B8W_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] ); + B2B8W_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] ); + B2B8W_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] ); + B2B8W_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] ); + B2B8W_G( 2, 7, 8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] ); + B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] ); + } + + ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] ); + ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] ); + ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] ); + ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] ); + ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] ); + ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] ); + ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] ); + ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] ); +} + +int blake2b_8way_init( blake2b_8way_ctx *ctx ) +{ + size_t i; + + ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 ); + ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B ); + ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B ); + ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 ); + ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 ); + ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F ); + ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B ); + ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 ); + + ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) ); + + ctx->t[0] = 0; + ctx->t[1] = 0; + ctx->c = 0; + ctx->outlen = 32; + + for ( i = 0; i < 16; i++ ) + ctx->b[i] = m512_zero; + + return 0; +} + + +void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input, + size_t inlen ) +{ + __m512i* in =(__m512i*)input; + + size_t i, c; + c = ctx->c >> 3; + + for ( i = 0; i < (inlen >> 3); i++ ) + { + if ( ctx->c == 128 ) + { + ctx->t[0] += ctx->c; + if ( ctx->t[0] < ctx->c ) + ctx->t[1]++; + blake2b_8way_compress( ctx, 0 ); + ctx->c = 0; + } + ctx->b[ c++ ] = in[i]; + ctx->c += 8; + } +} + +void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out ) +{ + size_t c; + c = ctx->c >> 3; + + 
ctx->t[0] += ctx->c; + if ( ctx->t[0] < ctx->c ) + ctx->t[1]++; + + while ( ctx->c < 128 ) + { + ctx->b[c++] = m512_zero; + ctx->c += 8; + } + + blake2b_8way_compress( ctx, 1 ); // final block flag = 1 + + casti_m512i( out, 0 ) = ctx->h[0]; + casti_m512i( out, 1 ) = ctx->h[1]; + casti_m512i( out, 2 ) = ctx->h[2]; + casti_m512i( out, 3 ) = ctx->h[3]; +} + +#endif + #if defined(__AVX2__) // G Mixing function. @@ -61,21 +233,6 @@ static const uint64_t blake2b_iv[8] = { static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last ) { - const uint8_t sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } - }; - int i; __m256i v[16], m[16]; v[ 0] = ctx->h[0]; @@ -118,7 +275,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last ) m[14] = ctx->b[14]; m[15] = ctx->b[15]; - for ( i = 0; i < 12; i++ ) + for ( int i = 0; i < 12; i++ ) { B2B_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] ); B2B_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] ); diff --git a/algo/blake/blake2b-hash-4way.h b/algo/blake/blake2b-hash-4way.h index 30abd15..979e4b2 100644 --- a/algo/blake/blake2b-hash-4way.h +++ b/algo/blake/blake2b-hash-4way.h @@ -2,8 +2,6 @@ #ifndef __BLAKE2B_HASH_4WAY_H__ #define __BLAKE2B_HASH_4WAY_H__ -#if defined(__AVX2__) - #include "simd-utils.h" #include #include @@ -16,14 +14,34 @@ #define ALIGN(x) __attribute__((aligned(x))) #endif + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +ALIGN(128) typedef struct { + __m512i b[16]; // input buffer + __m512i h[8]; // chained state + uint64_t t[2]; // total number of bytes + size_t c; // pointer for b[] + size_t outlen; // digest size +} blake2b_8way_ctx; + +int blake2b_8way_init( blake2b_8way_ctx *ctx ); +void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input, + size_t inlen ); +void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out ); + +#endif + +#if defined(__AVX2__) + // state context -ALIGN(64) typedef struct { +ALIGN(128) typedef struct { __m256i b[16]; // input buffer __m256i h[8]; // chained state uint64_t t[2]; // total number of bytes size_t c; // pointer for b[] size_t outlen; // digest size -} blake2b_4way_ctx __attribute__((aligned(64))); +} blake2b_4way_ctx; int blake2b_4way_init( blake2b_4way_ctx *ctx ); void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input, diff --git a/algo/blake/blake2s-4way.c b/algo/blake/blake2s-4way.c index 9048566..9a5c6eb 100644 --- a/algo/blake/blake2s-4way.c +++ b/algo/blake/blake2s-4way.c @@ -3,22 +3,72 @@ #include #include -#if defined(BLAKE2S_8WAY) +#if defined(BLAKE2S_16WAY) + +static __thread blake2s_16way_state blake2s_16w_ctx; + +void blake2s_16way_hash( void *output, const void *input ) +{ + blake2s_16way_state ctx; + memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx ); + blake2s_16way_update( &ctx, input + (64<<4), 16 ); + 
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES ); +} + +int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[20*16] __attribute__ ((aligned (128))); + uint32_t hash[8*16] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<4]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + __m512i *noncev = (__m512i*)vdata + 19; // aligned + uint32_t n = first_nonce; + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_16x32( vdata, pdata ); + blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES ); + blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 ); + + do { + *noncev = mm512_bswap_32( _mm512_set_epi32( + n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, + n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) ); + pdata[19] = n; + + blake2s_16way_hash( hash, vdata ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash7[lane] <= Htarg ) ) + { + extr_lane_16x32( lane_hash, hash, lane, 256 ); + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 16; + } while ( (n < max_nonce-16) && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce + 1; + return 0; +} + +#elif defined(BLAKE2S_8WAY) static __thread blake2s_8way_state blake2s_8w_ctx; void blake2s_8way_hash( void *output, const void *input ) { - uint32_t vhash[8*8] __attribute__ ((aligned (64))); blake2s_8way_state ctx; memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx ); - blake2s_8way_update( &ctx, input + (64<<3), 16 ); - blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES ); - - dintrlv_8x32( output, output+ 32, output+ 64, output+ 96, - output+128, output+160, output+192, output+224, - vhash, 256 ); + blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES ); } int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, @@ -26,13 +76,15 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, { uint32_t vdata[20*8] __attribute__ ((aligned (64))); uint32_t hash[8*8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash7 = &(hash[7<<3]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; __m256i *noncev = (__m256i*)vdata + 19; // aligned uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; mm256_bswap32_intrlv80_8x32( vdata, pdata ); blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES ); @@ -45,16 +97,17 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, blake2s_8way_hash( hash, vdata ); - - for ( int i = 0; i < 8; i++ ) - if ( (hash+(i<<3))[7] <= Htarg ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash7[lane] <= Htarg ) ) { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } } n += 8; - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); *hashes_done = n - first_nonce + 1; @@ -67,15 +120,10 @@ static __thread blake2s_4way_state blake2s_4w_ctx; void 
blake2s_4way_hash( void *output, const void *input ) { - uint32_t vhash[8*4] __attribute__ ((aligned (64))); blake2s_4way_state ctx; memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx ); - blake2s_4way_update( &ctx, input + (64<<2), 16 ); - blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES ); - - dintrlv_4x32( output, output+32, output+64, output+96, - vhash, 256 ); + blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES ); } int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, @@ -83,13 +131,15 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, { uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t hash[8*4] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash7 = &(hash[7<<2]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; __m128i *noncev = (__m128i*)vdata + 19; // aligned uint32_t n = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; mm128_bswap32_intrlv80_4x32( vdata, pdata ); blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES ); @@ -101,15 +151,16 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, blake2s_4way_hash( hash, vdata ); - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg ) { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); + extr_lane_4x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } } n += 4; - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); *hashes_done = n - first_nonce + 1; diff --git a/algo/blake/blake2s-gate.c b/algo/blake/blake2s-gate.c index a35047f..505c4d5 100644 --- a/algo/blake/blake2s-gate.c +++ b/algo/blake/blake2s-gate.c @@ -2,7 +2,11 @@ bool register_blake2s_algo( algo_gate_t* gate ) { -#if defined(BLAKE2S_8WAY) +#if defined(BLAKE2S_16WAY) + gate->scanhash = (void*)&scanhash_blake2s_16way; + gate->hash = (void*)&blake2s_16way_hash; +#elif defined(BLAKE2S_8WAY) +//#if defined(BLAKE2S_8WAY) gate->scanhash = (void*)&scanhash_blake2s_8way; gate->hash = (void*)&blake2s_8way_hash; #elif defined(BLAKE2S_4WAY) @@ -12,7 +16,7 @@ bool register_blake2s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_blake2s; gate->hash = (void*)&blake2s_hash; #endif - gate->optimizations = SSE2_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/blake/blake2s-gate.h b/algo/blake/blake2s-gate.h index a9c8b8f..4c621b4 100644 --- a/algo/blake/blake2s-gate.h +++ b/algo/blake/blake2s-gate.h @@ -8,13 +8,26 @@ #if defined(__SSE2__) #define BLAKE2S_4WAY #endif + #if defined(__AVX2__) #define BLAKE2S_8WAY #endif +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define BLAKE2S_16WAY +#endif + bool register_blake2s_algo( algo_gate_t* gate ); -#if defined(BLAKE2S_8WAY) +#if defined(BLAKE2S_16WAY) + +void blake2s_16way_hash( void *state, const void *input ); +int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined (BLAKE2S_8WAY) + +//#if defined(BLAKE2S_8WAY) void blake2s_8way_hash( void *state, const void *input ); int scanhash_blake2s_8way( struct work *work, 
uint32_t max_nonce, diff --git a/algo/blake/blake2s-hash-4way.c b/algo/blake/blake2s-hash-4way.c index e1982ba..68e16ab 100644 --- a/algo/blake/blake2s-hash-4way.c +++ b/algo/blake/blake2s-hash-4way.c @@ -165,13 +165,13 @@ do { \ // // Supported: // 64 + 16 bytes (blake2s with midstate optimization) -// 80 bytes without midstate (blake2s without midstate optimization) +// 80 bytes (blake2s without midstate optimization) // Any multiple of 64 bytes in one shot (x25x) // // Unsupported: -// Stream of 64 byte blocks one at a time. -// -// use for part blocks or when streaming more data +// Stream of full 64 byte blocks one at a time. + +// use only when streaming more data or final block not full. int blake2s_4way_update( blake2s_4way_state *S, const void *in, uint64_t inlen ) { @@ -466,6 +466,168 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen ) #endif // __AVX2__ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// Blake2s-256 16 way + +int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block ) +{ + __m512i m[16]; + __m512i v[16]; + + memcpy_512( m, block, 16 ); + memcpy_512( v, S->h, 8 ); + + v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL ); + v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL ); + v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL ); + v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL ); + v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ), + m512_const1_64( 0x510E527F510E527FULL ) ); + + v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ), + m512_const1_64( 0x9B05688C9B05688CULL ) ); + + v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ), + m512_const1_64( 0x1F83D9AB1F83D9ABULL ) ); + + v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ), + m512_const1_64( 0x5BE0CD195BE0CD19ULL ) ); + + +#define G16W( sigma0, sigma1, a, b, c, d) \ +do { \ + uint8_t s0 = sigma0; \ + uint8_t s1 = sigma1; \ + a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \ + d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \ + c = _mm512_add_epi32( c, d ); \ + b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \ + a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s1 ] ); \ + d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \ + c = _mm512_add_epi32( c, d ); \ + b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \ +} while(0) + +#define ROUND16W(r) \ +do { \ + uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \ + G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \ + G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \ + G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \ + G16W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \ + G16W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \ + G16W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \ + G16W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \ + G16W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \ +} while(0) + + ROUND16W( 0 ); + ROUND16W( 1 ); + ROUND16W( 2 ); + ROUND16W( 3 ); + ROUND16W( 4 ); + ROUND16W( 5 ); + ROUND16W( 6 ); + ROUND16W( 7 ); + ROUND16W( 8 ); + ROUND16W( 9 ); + + for( size_t i = 0; i < 8; ++i ) + S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] ); + +#undef G16W +#undef ROUND16W + return 0; +} + +int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen ) +{ + blake2s_nway_param P[1]; + + P->digest_length = outlen; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + P->leaf_length = 0; + *((uint64_t*)(P->node_offset)) = 0; + P->node_depth = 0; + 
P->inner_length = 0; + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + + memset( S, 0, sizeof( blake2s_16way_state ) ); + S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL ); + S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL ); + S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL ); + S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL ); + S->h[4] = m512_const1_64( 0x510E527F510E527FULL ); + S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL ); + S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL ); + S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL ); + + uint32_t *p = ( uint32_t * )( P ); + + /* IV XOR ParamBlock */ + for ( size_t i = 0; i < 8; ++i ) + S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) ); + return 0; +} + +int blake2s_16way_update( blake2s_16way_state *S, const void *in, + uint64_t inlen ) +{ + __m512i *input = (__m512i*)in; + __m512i *buf = (__m512i*)S->buf; + const int bsize = BLAKE2S_BLOCKBYTES; + + while( inlen > 0 ) + { + size_t left = S->buflen; + if( inlen >= bsize - left ) + { + memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 ); + S->buflen += bsize - left; + S->t[0] += BLAKE2S_BLOCKBYTES; + S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + blake2s_16way_compress( S, buf ); + S->buflen = 0; + input += ( bsize >> 2 ); + inlen -= bsize; + } + else + { + memcpy_512( buf + ( left>>2 ), input, inlen>>2 ); + S->buflen += (size_t) inlen; + input += ( inlen>>2 ); + inlen -= inlen; + } + } + return 0; +} + +int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen ) +{ + __m512i *buf = (__m512i*)S->buf; + + S->t[0] += S->buflen; + S->t[1] += ( S->t[0] < S->buflen ); + if ( S->last_node ) + S->f[1] = ~0U; + S->f[0] = ~0U; + + memset_zero_512( buf + ( S->buflen>>2 ), + ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 ); + blake2s_16way_compress( S, buf ); + + for ( int i = 0; i < 8; ++i ) + casti_m512i( out, i ) = S->h[ i ]; + return 0; +} + +#endif // AVX512 + + #if 0 int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) { diff --git a/algo/blake/blake2s-hash-4way.h b/algo/blake/blake2s-hash-4way.h index a273056..c9b06c4 100644 --- a/algo/blake/blake2s-hash-4way.h +++ b/algo/blake/blake2s-hash-4way.h @@ -64,7 +64,7 @@ typedef struct __blake2s_nway_param ALIGN( 64 ) typedef struct __blake2s_4way_state { __m128i h[8]; - uint8_t buf[ 2 * BLAKE2S_BLOCKBYTES * 4 ]; + uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; @@ -75,13 +75,16 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen ); int blake2s_4way_update( blake2s_4way_state *S, const void *in, uint64_t inlen ); int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen ); +int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, + const void *input, uint64_t inlen ); + #if defined(__AVX2__) ALIGN( 64 ) typedef struct __blake2s_8way_state { __m256i h[8]; - uint8_t buf[ 2 * BLAKE2S_BLOCKBYTES * 8 ]; + uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; @@ -92,9 +95,27 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen ); int blake2s_8way_update( blake2s_8way_state *S, const void *in, uint64_t inlen ); int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen ); -int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, - const void *input, uint64_t inlen ); +//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, +// const void *input, uint64_t inlen 
); +#endif + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +ALIGN( 128 ) typedef struct __blake2s_16way_state +{ + __m512i h[8]; + uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ]; + uint32_t t[2]; + uint32_t f[2]; + size_t buflen; + uint8_t last_node; +} blake2s_16way_state ; + +int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen ); +int blake2s_16way_update( blake2s_16way_state *S, const void *in, + uint64_t inlen ); +int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen ); #endif diff --git a/algo/bmw/bmw-hash-4way.h b/algo/bmw/bmw-hash-4way.h index bb23705..ebf0303 100644 --- a/algo/bmw/bmw-hash-4way.h +++ b/algo/bmw/bmw-hash-4way.h @@ -78,7 +78,7 @@ void bmw256_4way_addbits_and_close( // BMW-256 8 way 32 typedef struct { - __m256i buf[64]; + __m256i buf[16]; __m256i H[16]; size_t ptr; uint32_t bit_count; // assume bit_count fits in 32 bits @@ -121,7 +121,7 @@ typedef struct { __m256i H[16]; size_t ptr; sph_u64 bit_count; -} bmw_4way_big_context; +} bmw_4way_big_context __attribute__((aligned(128))); typedef bmw_4way_big_context bmw512_4way_context; @@ -137,6 +137,22 @@ void bmw512_4way_addbits_and_close( #endif // __AVX2__ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +typedef struct { + __m512i buf[16]; + __m512i H[16]; + size_t ptr; + uint64_t bit_count; +} bmw512_8way_context __attribute__((aligned(128))); + +void bmw512_8way_init( bmw512_8way_context *ctx ); +void bmw512_8way_update( bmw512_8way_context *ctx, const void *data, + size_t len ); +void bmw512_8way_close( bmw512_8way_context *ctx, void *dst ); + +#endif // AVX512 + #ifdef __cplusplus } #endif diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index b5cda8f..5e869e9 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -137,165 +137,151 @@ static const uint32_t IV256[] = { ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \ add_elt_s( M, H, (i)-16 ) ) +// Expressions are grouped using associativity to reduce CPU dependencies, +// resulting in some sign changes compared to the reference code.
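// Illustration of the regrouping described above, using Ws7 as an example and
// writing t[i] for M[i] ^ H[i].  The reference code evaluates
//
//    w7 = t1 - t4 - t5 - t12 - t14;        // strictly left to right: a chain
//                                          // of four dependent operations
//
// whereas the regrouped macro computes the same value as
//
//    w7 = ((t1 - t4) - t5) - (t12 + t14);  // the two halves can issue in
//                                          // parallel, shortening the chain
//
// Distributing the final '-' over the new parentheses is what flips some of
// the '+'/'-' signs relative to the reference code.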
+ #define Ws0 \ _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[13], H[13] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) + _mm_add_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[10], H[10] ) ), \ + _mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \ + _mm_xor_si128( M[14], H[14] ) ) ) #define Ws1 \ - _mm_sub_epi32( \ + _mm_add_epi32( \ _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[14], H[14] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) + _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \ + _mm_xor_si128( M[ 8], H[ 8] ) ), \ + _mm_xor_si128( M[11], H[11] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \ + _mm_xor_si128( M[15], H[15] ) ) ) #define Ws2 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[12], H[12] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \ + _mm_xor_si128( M[15], H[15] ) ) ) #define Ws3 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 1], H[ 1] ) ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[13], H[13] ) ) + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ + _mm_xor_si128( M[ 1], H[ 1] ) ), \ + _mm_xor_si128( M[ 8], H[ 8] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \ + _mm_xor_si128( M[13], H[13] ) ) ) #define Ws4 \ _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) + _mm_add_epi32( \ + _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ + _mm_xor_si128( M[ 2], H[ 2] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ), \ + _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \ + _mm_xor_si128( M[14], H[14] ) ) ) #define Ws5 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[12], H[12] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ + _mm_xor_si128( M[ 2], H[ 2] ) ), \ + _mm_xor_si128( M[10], H[10] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \ + _mm_xor_si128( M[15], H[15] ) ) ) #define Ws6 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \ - _mm_xor_si128( M[ 0], H[ 0] ) ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[13], H[13] ) ) + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \ + _mm_xor_si128( M[ 0], H[ 0] ) ), \ + _mm_xor_si128( M[ 3], H[ 3] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \ + _mm_xor_si128( M[13], H[13] ) ) ) #define Ws7 \ 
_mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[12], H[12] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ + _mm_xor_si128( M[ 4], H[ 4] ) ), \ + _mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \ + _mm_xor_si128( M[14], H[14] ) ) ) #define Ws8 \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[13], H[13] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) - -#define Ws9 \ _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[14], H[14] ) ) + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ + _mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \ + _mm_xor_si128( M[15], H[15] ) ) ) +#define Ws9 \ + _mm_sub_epi32( \ + _mm_add_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ + _mm_xor_si128( M[ 3], H[ 3] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \ + _mm_xor_si128( M[14], H[14] ) ) ) #define Ws10 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ - _mm_xor_si128( M[ 1], H[ 1] ) ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[15], H[15] ) ) + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ + _mm_xor_si128( M[ 1], H[ 1] ) ), \ + _mm_xor_si128( M[ 4], H[ 4] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \ + _mm_xor_si128( M[15], H[15] ) ) ) #define Ws11 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ - _mm_xor_si128( M[ 0], H[ 0] ) ), \ - _mm_xor_si128( M[ 2], H[ 2] ) ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ) + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ + _mm_xor_si128( M[ 0], H[ 0] ) ), \ + _mm_xor_si128( M[ 2], H[ 2] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ) ) #define Ws12 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ - _mm_xor_si128( M[ 3], H[ 3] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[10], H[10] ) ) + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ + _mm_xor_si128( M[ 3], H[ 3] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \ + _mm_xor_si128( M[10], H[10] ) ) ) #define Ws13 \ _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( \ - _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 7], H[ 7] ) ), \ - _mm_xor_si128( M[10], H[10] ) ), \ - _mm_xor_si128( M[11], H[11] ) ) + _mm_add_epi32( \ + _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ + _mm_xor_si128( M[ 4], H[ 4] ) ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \ + _mm_xor_si128( M[11], H[11] ) ) ) #define 
Ws14 \ _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_add_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ - _mm_xor_si128( M[ 5], H[ 5] ) ), \ - _mm_xor_si128( M[ 8], H[ 8] ) ), \ - _mm_xor_si128( M[11], H[11] ) ), \ - _mm_xor_si128( M[12], H[12] ) ) + _mm_add_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ + _mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_xor_si128( M[ 8], H[ 8] ) ), \ + _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \ + _mm_xor_si128( M[12], H[12] ) ) ) #define Ws15 \ - _mm_add_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( \ - _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \ - _mm_xor_si128( M[ 4], H[ 4] ) ), \ - _mm_xor_si128( M[ 6], H[ 6] ) ), \ - _mm_xor_si128( M[ 9], H[ 9] ) ), \ - _mm_xor_si128( M[13], H[13] ) ) + _mm_sub_epi32( \ + _mm_sub_epi32( \ + _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \ + _mm_xor_si128( M[ 4], H[4] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \ + _mm_xor_si128( M[13], H[13] ) ) ) void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] ) @@ -700,163 +686,148 @@ bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #define W8s0 \ _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_add_epi32( _mm256_xor_si256( M[13], H[13] ), \ + _mm256_xor_si256( M[14], H[14] ) ) ) #define W8s1 \ - _mm256_sub_epi32( \ + _mm256_add_epi32( \ _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 6], H[ 6] ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) + _mm256_sub_epi32( _mm256_xor_si256( M[ 6], H[ 6] ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[14], H[14] ), \ + _mm256_xor_si256( M[15], H[15] ) ) ) #define W8s2 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \ + _mm256_xor_si256( M[15], H[15] ) ) ) #define W8s3 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 1], H[ 1] ) ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[10], H[10] ), \ + _mm256_xor_si256( M[13], H[13] ) ) ) #define W8s4 \ _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ - 
_mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) + _mm256_add_epi32( \ + _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_add_epi32( _mm256_xor_si256( M[11], H[11] ), \ + _mm256_xor_si256( M[14], H[14] ) ) ) #define W8s5 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \ + _mm256_xor_si256( M[15], H[15] ) ) ) #define W8s6 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 4], H[ 4] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 4], H[ 4] ), \ + _mm256_xor_si256( M[ 0], H[ 0] ) ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[11], H[11] ), \ + _mm256_xor_si256( M[13], H[13] ) ) ) #define W8s7 \ _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_add_epi32( _mm256_xor_si256( M[12], H[12] ), \ + _mm256_xor_si256( M[14], H[14] ) ) ) #define W8s8 \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[13], H[13] ), \ + _mm256_xor_si256( M[15], H[15] ) ) ) #define W8s9 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 7], H[ 7] ), \ + _mm256_xor_si256( M[14], H[14] ) ) ) #define W8s10 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) + _mm256_sub_epi32( \ + 
_mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \ + _mm256_xor_si256( M[ 1], H[ 1] ) ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 7], H[ 7] ), \ + _mm256_xor_si256( M[15], H[15] ) ) ) #define W8s11 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ) + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \ + _mm256_xor_si256( M[ 0], H[ 0] ) ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ) ) #define W8s12 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ) + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 9], H[ 9] ), \ + _mm256_xor_si256( M[10], H[10] ) ) ) #define W8s13 \ _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( \ - _mm256_add_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ) + _mm256_add_epi32( \ + _mm256_add_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_add_epi32( _mm256_xor_si256( M[10], H[10] ), \ + _mm256_xor_si256( M[11], H[11] ) ) ) #define W8s14 \ _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_add_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ) + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_add_epi32( _mm256_xor_si256( M[11], H[11] ), \ + _mm256_xor_si256( M[12], H[12] ) ) ) #define W8s15 \ - _mm256_add_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( \ - _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[ 4], H[4] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \ + _mm256_xor_si256( M[ 4], H[4] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 9], H[ 9] ), \ + _mm256_xor_si256( M[13], H[13] ) ) ) + void compress_small_8way( const __m256i *M, const __m256i H[16], __m256i dH[16] ) diff --git a/algo/bmw/bmw512-4way.c b/algo/bmw/bmw512-4way.c index 9142e72..85235e2 100644 --- a/algo/bmw/bmw512-4way.c +++ b/algo/bmw/bmw512-4way.c @@ -1,13 +1,66 @@ #include "bmw512-gate.h" - -#ifdef BMW512_4WAY - #include #include #include //#include "sph_keccak.h" #include "bmw-hash-4way.h" +#if defined(BMW512_8WAY) + +void bmw512hash_8way(void *state, const void *input) +{ + bmw512_8way_context ctx; + 
bmw512_8way_init( &ctx ); + bmw512_8way_update( &ctx, input, 80 ); + bmw512_8way_close( &ctx, state ); +} + +int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t hash[16*8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash7 = &(hash[49]); // 3*16+1 + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + __m512i *noncev = (__m512i*)vdata + 9; // aligned +// const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0 , + n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); + + bmw512hash_8way( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) ) + { + extr_lane_8x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 4; + + } while ( (n < max_nonce-8) && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + return 0; +} + + +#elif defined(BMW512_4WAY) + +//#ifdef BMW512_4WAY + void bmw512hash_4way(void *state, const void *input) { bmw512_4way_context ctx; diff --git a/algo/bmw/bmw512-gate.c b/algo/bmw/bmw512-gate.c index 48277ed..fb7d0d3 100644 --- a/algo/bmw/bmw512-gate.c +++ b/algo/bmw/bmw512-gate.c @@ -2,9 +2,12 @@ bool register_bmw512_algo( algo_gate_t* gate ) { - gate->optimizations = AVX2_OPT; + gate->optimizations = AVX2_OPT | AVX512_OPT; opt_target_factor = 256.0; -#if defined (BMW512_4WAY) +#if defined (BMW512_8WAY) + gate->scanhash = (void*)&scanhash_bmw512_8way; + gate->hash = (void*)&bmw512hash_8way; +#elif defined (BMW512_4WAY) gate->scanhash = (void*)&scanhash_bmw512_4way; gate->hash = (void*)&bmw512hash_4way; #else diff --git a/algo/bmw/bmw512-gate.h b/algo/bmw/bmw512-gate.h index 9aeb519..4c7fb41 100644 --- a/algo/bmw/bmw512-gate.h +++ b/algo/bmw/bmw512-gate.h @@ -1,23 +1,33 @@ #ifndef BMW512_GATE_H__ -#define BMW512_GATE_H__ +#define BMW512_GATE_H__ 1 #include "algo-gate-api.h" #include -#if defined(__AVX2__) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define BMW512_8WAY 1 +#elif defined(__AVX2__) #define BMW512_4WAY 1 #endif -#if defined(BMW512_4WAY) +#if defined(BMW512_8WAY) + +void bmw512hash_8way( void *state, const void *input ); +int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(BMW512_4WAY) void bmw512hash_4way( void *state, const void *input ); int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -#endif +#else void bmw512hash( void *state, const void *input ); int scanhash_bmw512( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif + +#endif diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index e893c87..4e6e29d 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -556,7 +556,7 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) compress_big_2way( buf, h, h2 ); memcpy_128( buf, h2, 16 ); compress_big_2way( buf, final_b2, h1 ); - memcpy( (__m128i*)dst, h1+16, 8 
); + memcpy( (__m128i*)dst, h1+8, 8 ); } #endif // __SSE2__ @@ -636,165 +636,152 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \ add_elt_b( M, H, (i)-16 ) ) + + #define Wb0 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \ + _mm256_xor_si256( M[14], H[14] ) ) ) #define Wb1 \ - _mm256_sub_epi64( \ + _mm256_add_epi64( \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) + _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \ + _mm256_xor_si256( M[15], H[15] ) ) ) #define Wb2 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ + _mm256_xor_si256( M[15], H[15] ) ) ) #define Wb3 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 1], H[ 1] ) ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \ + _mm256_xor_si256( M[13], H[13] ) ) ) #define Wb4 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) + _mm256_add_epi64( \ + _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \ + _mm256_xor_si256( M[14], H[14] ) ) ) #define Wb5 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ + _mm256_xor_si256( M[15], 
H[15] ) ) ) #define Wb6 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \ + _mm256_xor_si256( M[ 0], H[ 0] ) ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \ + _mm256_xor_si256( M[13], H[13] ) ) ) #define Wb7 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \ + _mm256_xor_si256( M[14], H[14] ) ) ) #define Wb8 \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \ + _mm256_xor_si256( M[15], H[15] ) ) ) #define Wb9 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \ + _mm256_xor_si256( M[14], H[14] ) ) ) #define Wb10 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ + _mm256_xor_si256( M[ 1], H[ 1] ) ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \ + _mm256_xor_si256( M[15], H[15] ) ) ) #define Wb11 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ) + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ + _mm256_xor_si256( M[ 0], H[ 0] ) ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ) ) #define Wb12 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( 
M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ) + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \ + _mm256_xor_si256( M[10], H[10] ) ) ) #define Wb13 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ) + _mm256_add_epi64( \ + _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \ + _mm256_xor_si256( M[11], H[11] ) ) ) #define Wb14 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ) + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \ + _mm256_xor_si256( M[12], H[12] ) ) ) #define Wb15 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[ 4], H[4] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ + _mm256_xor_si256( M[ 4], H[4] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \ + _mm256_xor_si256( M[13], H[13] ) ) ) + void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) { @@ -1079,6 +1066,477 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #endif // __AVX2__ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// BMW-512 8 WAY + +#define s8b0(x) \ + mm512_xor4( _mm512_srli_epi64( (x), 1), _mm512_slli_epi64( (x), 3), \ + mm512_rol_64( (x), 4), mm512_rol_64( (x),37) ) + +#define s8b1(x) \ + mm512_xor4( _mm512_srli_epi64( (x), 1), _mm512_slli_epi64( (x), 2), \ + mm512_rol_64( (x),13), mm512_rol_64( (x),43) ) + +#define s8b2(x) \ + mm512_xor4( _mm512_srli_epi64( (x), 2), _mm512_slli_epi64( (x), 1), \ + mm512_rol_64( (x),19), mm512_rol_64( (x),53) ) + +#define s8b3(x) \ + mm512_xor4( _mm512_srli_epi64( (x), 2), _mm512_slli_epi64( (x), 2), \ + mm512_rol_64( (x),28), mm512_rol_64( (x),59) ) + +#define s8b4(x) \ + _mm512_xor_si512( (x), _mm512_srli_epi64( (x), 1 ) ) + +#define s8b5(x) \ + _mm512_xor_si512( (x), _mm512_srli_epi64( (x), 2 ) ) + +#define r8b1(x) mm512_rol_64( x, 5 ) +#define r8b2(x) mm512_rol_64( x, 11 ) +#define r8b3(x) mm512_rol_64( x, 27 ) +#define r8b4(x) mm512_rol_64( x, 32 ) +#define r8b5(x) mm512_rol_64( x, 37 ) +#define r8b6(x) mm512_rol_64( x, 43 ) +#define r8b7(x) mm512_rol_64( x, 53 ) + +#define rol8w_off_64( M, j, off ) \ + mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ + ( ( (j) + (off) ) & 0xF ) + 1 ) + +#define add_elt_b8( M, H, j ) \ + 
_mm512_xor_si512( \ + _mm512_add_epi64( \ + _mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \ + rol8w_off_64( M, j, 3 ) ), \ + rol8w_off_64( M, j, 10 ) ), \ + _mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ + H[ ( (j)+7 ) & 0xF ] ) + +#define expand1b8( qt, M, H, i ) \ + _mm512_add_epi64( mm512_add4_64( \ + mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \ + s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \ + mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \ + s8b3( qt[ (i)-10 ] ), s8b0( qt[ (i)- 9 ] )), \ + mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \ + s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \ + mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \ + s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \ + add_elt_b8( M, H, (i)-16 ) ) + +#define expand2b8( qt, M, H, i) \ + _mm512_add_epi64( mm512_add4_64( \ + mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \ + qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \ + mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \ + qt[ (i)-10 ], r8b4( qt[ (i)- 9 ] ) ), \ + mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \ + qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \ + mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \ + s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \ + add_elt_b8( M, H, (i)-16 ) ) + + + +#define W8b0 \ + _mm512_add_epi64( \ + _mm512_add_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \ + _mm512_xor_si512( M[ 7], H[ 7] ) ), \ + _mm512_xor_si512( M[10], H[10] ) ), \ + _mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \ + _mm512_xor_si512( M[14], H[14] ) ) ) + +#define W8b1 \ + _mm512_add_epi64( \ + _mm512_add_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \ + _mm512_xor_si512( M[ 8], H[ 8] ) ), \ + _mm512_xor_si512( M[11], H[11] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \ + _mm512_xor_si512( M[15], H[15] ) ) ) + +#define W8b2 \ + _mm512_sub_epi64( \ + _mm512_add_epi64( \ + _mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ + _mm512_xor_si512( M[ 7], H[ 7] ) ), \ + _mm512_xor_si512( M[ 9], H[ 9] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ + _mm512_xor_si512( M[15], H[15] ) ) ) + +#define W8b3 \ + _mm512_sub_epi64( \ + _mm512_add_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ + _mm512_xor_si512( M[ 1], H[ 1] ) ), \ + _mm512_xor_si512( M[ 8], H[ 8] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \ + _mm512_xor_si512( M[13], H[13] ) ) ) + +#define W8b4 \ + _mm512_sub_epi64( \ + _mm512_add_epi64( \ + _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ + _mm512_xor_si512( M[ 2], H[ 2] ) ), \ + _mm512_xor_si512( M[ 9], H[ 9] ) ), \ + _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \ + _mm512_xor_si512( M[14], H[14] ) ) ) + +#define W8b5 \ + _mm512_sub_epi64( \ + _mm512_add_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \ + _mm512_xor_si512( M[ 2], H[ 2] ) ), \ + _mm512_xor_si512( M[10], H[10] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ + _mm512_xor_si512( M[15], H[15] ) ) ) + +#define W8b6 \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \ + _mm512_xor_si512( M[ 0], H[ 0] ) ), \ + _mm512_xor_si512( M[ 3], H[ 3] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \ + _mm512_xor_si512( M[13], H[13] ) ) ) + +#define W8b7 \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ + _mm512_xor_si512( M[ 4], H[ 4] ) ), \ + _mm512_xor_si512( 
M[ 5], H[ 5] ) ), \ + _mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \ + _mm512_xor_si512( M[14], H[14] ) ) ) + +#define W8b8 \ + _mm512_add_epi64( \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \ + _mm512_xor_si512( M[ 5], H[ 5] ) ), \ + _mm512_xor_si512( M[ 6], H[ 6] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \ + _mm512_xor_si512( M[15], H[15] ) ) ) + +#define W8b9 \ + _mm512_sub_epi64( \ + _mm512_add_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ + _mm512_xor_si512( M[ 3], H[ 3] ) ), \ + _mm512_xor_si512( M[ 6], H[ 6] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \ + _mm512_xor_si512( M[14], H[14] ) ) ) + +#define W8b10 \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \ + _mm512_xor_si512( M[ 1], H[ 1] ) ), \ + _mm512_xor_si512( M[ 4], H[ 4] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \ + _mm512_xor_si512( M[15], H[15] ) ) ) + +#define W8b11 \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \ + _mm512_xor_si512( M[ 0], H[ 0] ) ), \ + _mm512_xor_si512( M[ 2], H[ 2] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \ + _mm512_xor_si512( M[ 9], H[ 9] ) ) ) + +#define W8b12 \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( \ + _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ + _mm512_xor_si512( M[ 3], H[ 3] ) ), \ + _mm512_xor_si512( M[ 6], H[ 6] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \ + _mm512_xor_si512( M[10], H[10] ) ) ) + +#define W8b13 \ + _mm512_add_epi64( \ + _mm512_add_epi64( \ + _mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \ + _mm512_xor_si512( M[ 4], H[ 4] ) ), \ + _mm512_xor_si512( M[ 7], H[ 7] ) ), \ + _mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \ + _mm512_xor_si512( M[11], H[11] ) ) ) + +#define W8b14 \ + _mm512_sub_epi64( \ + _mm512_add_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \ + _mm512_xor_si512( M[ 5], H[ 5] ) ), \ + _mm512_xor_si512( M[ 8], H[ 8] ) ), \ + _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \ + _mm512_xor_si512( M[12], H[12] ) ) ) + +#define W8b15 \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( \ + _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ + _mm512_xor_si512( M[ 4], H[4] ) ), \ + _mm512_xor_si512( M[ 6], H[ 6] ) ), \ + _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \ + _mm512_xor_si512( M[13], H[13] ) ) ) + +void compress_big_8way( const __m512i *M, const __m512i H[16], + __m512i dH[16] ) +{ + __m512i qt[32], xl, xh; + + qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] ); + qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] ); + qt[ 2] = _mm512_add_epi64( s8b2( W8b2 ), H[ 3] ); + qt[ 3] = _mm512_add_epi64( s8b3( W8b3 ), H[ 4] ); + qt[ 4] = _mm512_add_epi64( s8b4( W8b4 ), H[ 5] ); + qt[ 5] = _mm512_add_epi64( s8b0( W8b5 ), H[ 6] ); + qt[ 6] = _mm512_add_epi64( s8b1( W8b6 ), H[ 7] ); + qt[ 7] = _mm512_add_epi64( s8b2( W8b7 ), H[ 8] ); + qt[ 8] = _mm512_add_epi64( s8b3( W8b8 ), H[ 9] ); + qt[ 9] = _mm512_add_epi64( s8b4( W8b9 ), H[10] ); + qt[10] = _mm512_add_epi64( s8b0( W8b10), H[11] ); + qt[11] = _mm512_add_epi64( s8b1( W8b11), H[12] ); + qt[12] = _mm512_add_epi64( s8b2( W8b12), H[13] ); + qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] ); + qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] ); + qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] ); + qt[16] = expand1b8( qt, M, H, 16 ); + qt[17] = expand1b8( qt, M, H, 17 ); + qt[18] = expand2b8( qt, M, H, 18 ); + qt[19] = expand2b8( qt, M, 
H, 19 ); + qt[20] = expand2b8( qt, M, H, 20 ); + qt[21] = expand2b8( qt, M, H, 21 ); + qt[22] = expand2b8( qt, M, H, 22 ); + qt[23] = expand2b8( qt, M, H, 23 ); + qt[24] = expand2b8( qt, M, H, 24 ); + qt[25] = expand2b8( qt, M, H, 25 ); + qt[26] = expand2b8( qt, M, H, 26 ); + qt[27] = expand2b8( qt, M, H, 27 ); + qt[28] = expand2b8( qt, M, H, 28 ); + qt[29] = expand2b8( qt, M, H, 29 ); + qt[30] = expand2b8( qt, M, H, 30 ); + qt[31] = expand2b8( qt, M, H, 31 ); + + xl = _mm512_xor_si512( + mm512_xor4( qt[16], qt[17], qt[18], qt[19] ), + mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) ); + xh = _mm512_xor_si512( xl, _mm512_xor_si512( + mm512_xor4( qt[24], qt[25], qt[26], qt[27] ), + mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); + +#define DH1( m, sl, sr, a, b, c ) \ + _mm512_add_epi64( \ + _mm512_xor_si512( M[m], \ + _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \ + _mm512_srli_epi64( qt[a], sr ) ) ), \ + _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + +#define DHL( m, rl, sl, h, a, b, c ) \ + _mm512_add_epi64( _mm512_add_epi64( \ + mm512_rol_64( dH[h], rl ), \ + _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ + _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \ + _mm512_xor_si512( qt[b], qt[c] ) ) ); + +#define DHR( m, rl, sr, h, a, b, c ) \ + _mm512_add_epi64( _mm512_add_epi64( \ + mm512_rol_64( dH[h], rl ), \ + _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ + _mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \ + _mm512_xor_si512( qt[b], qt[c] ) ) ); + + + dH[ 0] = DH1( 0, 5, 5, 16, 24, 0 ); + dH[ 1] = DH1( 1, 7, 8, 17, 25, 1 ); + dH[ 2] = DH1( 2, 5, 5, 18, 26, 2 ); + dH[ 3] = DH1( 3, 1, 5, 19, 27, 3 ); + dH[ 4] = DH1( 4, 3, 0, 20, 28, 4 ); + dH[ 5] = DH1( 5, 6, 6, 21, 29, 5 ); + dH[ 6] = DH1( 6, 4, 6, 22, 30, 6 ); + dH[ 7] = DH1( 7, 11, 2, 23, 31, 7 ); + dH[ 8] = DHL( 8, 9, 8, 4, 24, 23, 8 ); + dH[ 9] = DHR( 9, 10, 6, 5, 25, 16, 9 ); + dH[10] = DHL( 10, 11, 6, 6, 26, 17, 10 ); + dH[11] = DHL( 11, 12, 4, 7, 27, 18, 11 ); + dH[12] = DHR( 12, 13, 3, 0, 28, 19, 12 ); + dH[13] = DHR( 13, 14, 4, 1, 29, 20, 13 ); + dH[14] = DHR( 14, 15, 7, 2, 30, 21, 14 ); + dH[15] = DHR( 15, 16, 2, 3, 31, 22, 15 ); + +#undef DH1 +#undef DHL +#undef DHR + +} + +static const __m512i final_b8[16] = +{ + { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0, + 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0, + 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0, + 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 }, + { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1, + 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1, + 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1, + 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 }, + { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2, + 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2, + 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2, + 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 }, + { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3, + 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3, + 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3, + 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 }, + { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4, + 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4, + 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4, + 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 }, + { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5, + 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5, + 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5, + 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 }, + { 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6, + 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6, + 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6, + 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 }, + { 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7, + 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7, + 
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7, + 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 }, + { 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8, + 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8, + 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8, + 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 }, + { 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9, + 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9, + 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9, + 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 }, + { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, + 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, + 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, + 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }, + { 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab, + 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab, + 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab, + 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab }, + { 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac, + 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac, + 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac, + 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac }, + { 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad, + 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad, + 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad, + 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad }, + { 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae, + 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae, + 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae, + 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae }, + { 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf, + 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf, + 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf, + 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf } +}; + + +void bmw512_8way_init( bmw512_8way_context *ctx ) +//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv ) +{ + ctx->H[ 0] = m512_const1_64( 0x8081828384858687 ); + ctx->H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F ); + ctx->H[ 2] = m512_const1_64( 0x9091929394959697 ); + ctx->H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F ); + ctx->H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 ); + ctx->H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF ); + ctx->H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 ); + ctx->H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF ); + ctx->H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 ); + ctx->H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF ); + ctx->H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 ); + ctx->H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF ); + ctx->H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 ); + ctx->H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF ); + ctx->H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 ); + ctx->H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF ); + ctx->ptr = 0; + ctx->bit_count = 0; +} + +void bmw512_8way_update( bmw512_8way_context *ctx, const void *data, + size_t len ) +{ + __m512i *vdata = (__m512i*)data; + __m512i *buf; + __m512i htmp[16]; + __m512i *h1, *h2; + size_t ptr; + const int buf_size = 128; // bytes of one lane, compatible with len + + ctx->bit_count += len << 3; + buf = ctx->buf; + ptr = ctx->ptr; + h1 = ctx->H; + h2 = htmp; + while ( len > 0 ) + { + size_t clen; + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_512( buf + (ptr>>3), vdata, clen >> 3 ); + vdata = vdata + (clen>>3); + len -= clen; + ptr += clen; + if ( ptr == buf_size ) + { + __m512i *ht; + compress_big_8way( buf, h1, h2 ); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + ctx->ptr = ptr; + if ( h1 != ctx->H ) + memcpy_512( ctx->H, h1, 16 ); +} + +void bmw512_8way_close( bmw512_8way_context *ctx, void *dst ) +{ + __m512i *buf; + __m512i h1[16], h2[16], *h; + size_t ptr, u, v; + const int buf_size = 128; // bytes of one lane, compatible with len + + buf = ctx->buf; + ptr = ctx->ptr; + buf[ ptr>>3 ] = m512_const1_64( 0x80 ); + ptr += 8; + h = 
ctx->H; + + if ( ptr > (buf_size - 8) ) + { + memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + compress_big_8way( buf, h, h1 ); + ptr = 0; + h = h1; + } + memset_zero_512( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 ); + buf[ (buf_size - 8) >> 3 ] = _mm512_set1_epi64( ctx->bit_count ); + compress_big_8way( buf, h, h2 ); + for ( u = 0; u < 16; u ++ ) + buf[ u ] = h2[ u ]; + compress_big_8way( buf, final_b8, h1 ); + for (u = 0, v = 8; u < 8; u ++, v ++) + casti_m512i( dst, u ) = h1[ v ]; +} + +#endif // AVX512 + #ifdef __cplusplus } #endif diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index f7be5bb..73f5d9c 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -1,18 +1,68 @@ #include "keccak-gate.h" - -#ifdef KECCAK_4WAY - #include #include #include #include "sph_keccak.h" #include "keccak-hash-4way.h" +#if defined(KECCAK_8WAY) + +void keccakhash_8way(void *state, const void *input) +{ + keccak256_8way_context ctx; + keccak256_8way_init( &ctx ); + keccak256_8way_update( &ctx, input, 80 ); + keccak256_8way_close( &ctx, state ); +} + +int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[24*8] __attribute__ ((aligned (128))); + uint32_t hash[16*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[49]); // 3*16+1 + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); + + keccakhash_8way( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane<<1 ] < Htarg ) + { + extr_lane_8x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + + } while ( (n < max_nonce-8) && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + return 0; +} + +#elif defined(KECCAK_4WAY) + void keccakhash_4way(void *state, const void *input) { keccak256_4way_context ctx; keccak256_4way_init( &ctx ); - keccak256_4way( &ctx, input, 80 ); + keccak256_4way_update( &ctx, input, 80 ); keccak256_4way_close( &ctx, state ); } @@ -28,8 +78,8 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; __m256i *noncev = (__m256i*)vdata + 9; // aligned -// const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated + const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; mm256_bswap32_intrlv80_4x64( vdata, pdata ); do { @@ -39,7 +89,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, keccakhash_4way( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) + if ( hash7[ lane<<1 ] < Htarg ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index 0ebc2d4..c8334a2 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -3,30 +3,36 @@ bool register_keccak_algo( algo_gate_t* 
gate ) { - gate->optimizations = AVX2_OPT; + gate->optimizations = AVX2_OPT | AVX512_OPT; gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; opt_target_factor = 128.0; -#if defined (KECCAK_4WAY) +#if defined (KECCAK_8WAY) + gate->scanhash = (void*)&scanhash_keccak_8way; + gate->hash = (void*)&keccakhash_8way; +#elif defined (KECCAK_4WAY) gate->scanhash = (void*)&scanhash_keccak_4way; gate->hash = (void*)&keccakhash_4way; #else - gate->scanhash = (void*)&scanhash_keccak; - gate->hash = (void*)&keccakhash; + gate->scanhash = (void*)&scanhash_keccak; + gate->hash = (void*)&keccakhash; #endif return true; }; bool register_keccakc_algo( algo_gate_t* gate ) { - gate->optimizations = AVX2_OPT; + gate->optimizations = AVX2_OPT | AVX512_OPT; gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root; opt_target_factor = 256.0; -#if defined (KECCAK_4WAY) +#if defined (KECCAK_8WAY) + gate->scanhash = (void*)&scanhash_keccak_8way; + gate->hash = (void*)&keccakhash_8way; +#elif defined (KECCAK_4WAY) gate->scanhash = (void*)&scanhash_keccak_4way; gate->hash = (void*)&keccakhash_4way; #else - gate->scanhash = (void*)&scanhash_keccak; - gate->hash = (void*)&keccakhash; + gate->scanhash = (void*)&scanhash_keccak; + gate->hash = (void*)&keccakhash; #endif return true; }; diff --git a/algo/keccak/keccak-gate.h b/algo/keccak/keccak-gate.h index e9fc5e7..0b78450 100644 --- a/algo/keccak/keccak-gate.h +++ b/algo/keccak/keccak-gate.h @@ -1,23 +1,33 @@ #ifndef KECCAK_GATE_H__ -#define KECCAK_GATE_H__ +#define KECCAK_GATE_H__ 1 #include "algo-gate-api.h" #include -#if defined(__AVX2__) - #define KECCAK_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define KECCAK_8WAY 1 +#elif defined(__AVX2__) + #define KECCAK_4WAY 1 #endif -#if defined(KECCAK_4WAY) +#if defined(KECCAK_8WAY) + +void keccakhash_8way( void *state, const void *input ); +int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(KECCAK_4WAY) void keccakhash_4way( void *state, const void *input ); int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -#endif +#else void keccakhash( void *state, const void *input ); int scanhash_keccak( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif + +#endif diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index bb31081..4108be6 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -1,23 +1,24 @@ #include +#include #include "keccak-hash-4way.h" -#if defined(__AVX2__) - -static const sph_u64 RC[] = { - SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), - SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), - SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), - SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), - SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), - SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), - SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), - SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), - SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), - SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) +static const uint64_t RC[] = { + 0x0000000000000001, 0x0000000000008082, + 0x800000000000808A, 0x8000000080008000, + 
0x000000000000808B, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, + 0x000000000000008A, 0x0000000000000088, + 0x0000000080008009, 0x000000008000000A, + 0x000000008000808B, 0x800000000000008B, + 0x8000000000008089, 0x8000000000008003, + 0x8000000000008002, 0x8000000000000080, + 0x000000000000800A, 0x800000008000000A, + 0x8000000080008081, 0x8000000000008080, + 0x0000000080000001, 0x8000000080008008 }; +// generic macros + #define a00 (kc->w[ 0]) #define a10 (kc->w[ 1]) #define a20 (kc->w[ 2]) @@ -48,6 +49,197 @@ static const sph_u64 RC[] = { #define READ_STATE(sc) #define WRITE_STATE(sc) +#define MOV64(d, s) (d = s) +#define XOR64_IOTA XOR64 + +#define LPAR ( +#define RPAR ) + +#define DO(x) x + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size>>3); j++ ) \ + kc->w[j ] = _mm512_xor_si512( kc->w[j], buf[j] ); \ +} while (0) + +// Targetted macros, keccak-macros.h is included for each target. + +#define DECL64(x) __m512i x +#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) +#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) +#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) +#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) +#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) + +#include "keccak-macros.c" + +#define KECCAK_F_1600 DO(KECCAK_F_1600_512) + +#define KECCAK_F_1600_512 do { \ + int j; \ + for (j = 0; j < 24; j += 8) \ + { \ + KF_ELT( 0, 1, _mm512_set1_epi64( RC[j + 0] ) ); \ + KF_ELT( 1, 2, _mm512_set1_epi64( RC[j + 1] ) ); \ + KF_ELT( 2, 3, _mm512_set1_epi64( RC[j + 2] ) ); \ + KF_ELT( 3, 4, _mm512_set1_epi64( RC[j + 3] ) ); \ + KF_ELT( 4, 5, _mm512_set1_epi64( RC[j + 4] ) ); \ + KF_ELT( 5, 6, _mm512_set1_epi64( RC[j + 5] ) ); \ + KF_ELT( 6, 7, _mm512_set1_epi64( RC[j + 6] ) ); \ + KF_ELT( 7, 8, _mm512_set1_epi64( RC[j + 7] ) ); \ + P8_TO_P0; \ + } \ +} while (0) + +static void keccak64_8way_init( keccak64_ctx_m512i *kc, unsigned out_size ) +{ + __m512i zero = m512_zero; + __m512i neg1 = m512_neg1; + + // Initialization for the "lane complement". 
+ kc->w[ 0] = zero; kc->w[ 1] = neg1; + kc->w[ 2] = neg1; kc->w[ 3] = zero; + kc->w[ 4] = zero; kc->w[ 5] = zero; + kc->w[ 6] = zero; kc->w[ 7] = zero; + kc->w[ 8] = neg1; kc->w[ 9] = zero; + kc->w[10] = zero; kc->w[11] = zero; + kc->w[12] = neg1; kc->w[13] = zero; + kc->w[14] = zero; kc->w[15] = zero; + kc->w[16] = zero; kc->w[17] = neg1; + kc->w[18] = zero; kc->w[19] = zero; + kc->w[20] = neg1; kc->w[21] = zero; + kc->w[22] = zero; kc->w[23] = zero; + kc->w[24] = zero; kc->ptr = 0; + kc->lim = 200 - (out_size >> 2); +} + +static void +keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len, + size_t lim ) +{ + __m512i *buf; + __m512i *vdata = (__m512i*)data; + size_t ptr; + DECL_STATE + + buf = kc->buf; + ptr = kc->ptr; + + if ( len < (lim - ptr) ) + { + memcpy_512( buf + (ptr>>3), vdata, len>>3 ); + kc->ptr = ptr + len; + return; + } + READ_STATE( kc ); + while ( len > 0 ) + { + size_t clen; + + clen = (lim - ptr); + if ( clen > len ) + clen = len; + memcpy_512( buf + (ptr>>3), vdata, clen>>3 ); + ptr += clen; + vdata = vdata + (clen>>3); + len -= clen; + if ( ptr == lim ) + { + INPUT_BUF( lim ); + KECCAK_F_1600; + ptr = 0; + } + } + WRITE_STATE( kc ); + kc->ptr = ptr; +} + +static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst, + size_t byte_len, size_t lim ) +{ + unsigned eb; + union { + __m512i tmp[lim + 1]; + sph_u64 dummy; /* for alignment */ + } u; + size_t j; + size_t m512_len = byte_len >> 3; + + eb = 0x100 >> 8; + if ( kc->ptr == (lim - 8) ) + { + const uint64_t t = eb | 0x8000000000000000; + u.tmp[0] = m512_const1_64( t ); + j = 8; + } + else + { + j = lim - kc->ptr; + u.tmp[0] = m512_const1_64( eb ); + memset_zero_512( u.tmp + 1, (j>>3) - 2 ); + u.tmp[ (j>>3) - 1] = m512_const1_64( 0x8000000000000000 ); + } + keccak64_8way_core( kc, u.tmp, j, lim ); + /* Finalize the "lane complement" */ + NOT64( kc->w[ 1], kc->w[ 1] ); + NOT64( kc->w[ 2], kc->w[ 2] ); + NOT64( kc->w[ 8], kc->w[ 8] ); + NOT64( kc->w[12], kc->w[12] ); + NOT64( kc->w[17], kc->w[17] ); + NOT64( kc->w[20], kc->w[20] ); + memcpy_512( dst, kc->w, m512_len ); +} + +void keccak256_8way_init( void *kc ) +{ + keccak64_8way_init( kc, 256 ); +} + +void +keccak256_8way_update(void *cc, const void *data, size_t len) +{ + keccak64_8way_core(cc, data, len, 136); +} + +void +keccak256_8way_close(void *cc, void *dst) +{ + keccak64_8way_close(cc, dst, 32, 136); +} + +void keccak512_8way_init( void *kc ) +{ + keccak64_8way_init( kc, 512 ); +} + +void +keccak512_8way_update(void *cc, const void *data, size_t len) +{ + keccak64_8way_core(cc, data, len, 72); +} + +void +keccak512_8way_close(void *cc, void *dst) +{ + keccak64_8way_close(cc, dst, 64, 72); +} + +#undef INPUT_BUF +#undef DECL64 +#undef XOR64 +#undef AND64 +#undef OR64 +#undef NOT64 +#undef ROL64 +#undef KECCAK_F_1600 + +#endif // AVX512 + +#if defined(__AVX2__) + #define INPUT_BUF(size) do { \ size_t j; \ for (j = 0; j < (size>>3); j++ ) \ @@ -55,314 +247,28 @@ static const sph_u64 RC[] = { } while (0) #define DECL64(x) __m256i x -#define MOV64(d, s) (d = s) #define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) #define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define ROL64(d, v, n) (d = mm256_rol_64(v, n)) -#define XOR64_IOTA XOR64 -#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ - DECL64(tt0); \ - DECL64(tt1); \ - DECL64(tt2); \ - DECL64(tt3); \ - XOR64(tt0, d0, d1); \ - XOR64(tt1, d2, d3); \ - XOR64(tt0, tt0, d4); \ - 
XOR64(tt0, tt0, tt1); \ - ROL64(tt0, tt0, 1); \ - XOR64(tt2, c0, c1); \ - XOR64(tt3, c2, c3); \ - XOR64(tt0, tt0, c4); \ - XOR64(tt2, tt2, tt3); \ - XOR64(t, tt0, tt2); \ - } while (0) +#include "keccak-macros.c" -#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - DECL64(t0); \ - DECL64(t1); \ - DECL64(t2); \ - DECL64(t3); \ - DECL64(t4); \ - TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ - TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ - TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ - TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ - TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ - XOR64(b00, b00, t0); \ - XOR64(b01, b01, t0); \ - XOR64(b02, b02, t0); \ - XOR64(b03, b03, t0); \ - XOR64(b04, b04, t0); \ - XOR64(b10, b10, t1); \ - XOR64(b11, b11, t1); \ - XOR64(b12, b12, t1); \ - XOR64(b13, b13, t1); \ - XOR64(b14, b14, t1); \ - XOR64(b20, b20, t2); \ - XOR64(b21, b21, t2); \ - XOR64(b22, b22, t2); \ - XOR64(b23, b23, t2); \ - XOR64(b24, b24, t2); \ - XOR64(b30, b30, t3); \ - XOR64(b31, b31, t3); \ - XOR64(b32, b32, t3); \ - XOR64(b33, b33, t3); \ - XOR64(b34, b34, t3); \ - XOR64(b40, b40, t4); \ - XOR64(b41, b41, t4); \ - XOR64(b42, b42, t4); \ - XOR64(b43, b43, t4); \ - XOR64(b44, b44, t4); \ - } while (0) +#define KECCAK_F_1600 DO(KECCAK_F_1600_256) -#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - /* ROL64(b00, b00, 0); */ \ - ROL64(b01, b01, 36); \ - ROL64(b02, b02, 3); \ - ROL64(b03, b03, 41); \ - ROL64(b04, b04, 18); \ - ROL64(b10, b10, 1); \ - ROL64(b11, b11, 44); \ - ROL64(b12, b12, 10); \ - ROL64(b13, b13, 45); \ - ROL64(b14, b14, 2); \ - ROL64(b20, b20, 62); \ - ROL64(b21, b21, 6); \ - ROL64(b22, b22, 43); \ - ROL64(b23, b23, 15); \ - ROL64(b24, b24, 61); \ - ROL64(b30, b30, 28); \ - ROL64(b31, b31, 55); \ - ROL64(b32, b32, 25); \ - ROL64(b33, b33, 21); \ - ROL64(b34, b34, 56); \ - ROL64(b40, b40, 27); \ - ROL64(b41, b41, 20); \ - ROL64(b42, b42, 39); \ - ROL64(b43, b43, 8); \ - ROL64(b44, b44, 14); \ - } while (0) - -/* - * The KHI macro integrates the "lane complement" optimization. On input, - * some words are complemented: - * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43 - * On output, the following words are complemented: - * a04 a10 a20 a22 a23 a31 - * - * The (implicit) permutation and the theta expansion will bring back - * the input mask for the next round. 
- */ - -#define KHI_XO(d, a, b, c) do { \ - DECL64(kt); \ - OR64(kt, b, c); \ - XOR64(d, a, kt); \ - } while (0) - -#define KHI_XA(d, a, b, c) do { \ - DECL64(kt); \ - AND64(kt, b, c); \ - XOR64(d, a, kt); \ - } while (0) - -#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - DECL64(c0); \ - DECL64(c1); \ - DECL64(c2); \ - DECL64(c3); \ - DECL64(c4); \ - DECL64(bnn); \ - NOT64(bnn, b20); \ - KHI_XO(c0, b00, b10, b20); \ - KHI_XO(c1, b10, bnn, b30); \ - KHI_XA(c2, b20, b30, b40); \ - KHI_XO(c3, b30, b40, b00); \ - KHI_XA(c4, b40, b00, b10); \ - MOV64(b00, c0); \ - MOV64(b10, c1); \ - MOV64(b20, c2); \ - MOV64(b30, c3); \ - MOV64(b40, c4); \ - NOT64(bnn, b41); \ - KHI_XO(c0, b01, b11, b21); \ - KHI_XA(c1, b11, b21, b31); \ - KHI_XO(c2, b21, b31, bnn); \ - KHI_XO(c3, b31, b41, b01); \ - KHI_XA(c4, b41, b01, b11); \ - MOV64(b01, c0); \ - MOV64(b11, c1); \ - MOV64(b21, c2); \ - MOV64(b31, c3); \ - MOV64(b41, c4); \ - NOT64(bnn, b32); \ - KHI_XO(c0, b02, b12, b22); \ - KHI_XA(c1, b12, b22, b32); \ - KHI_XA(c2, b22, bnn, b42); \ - KHI_XO(c3, bnn, b42, b02); \ - KHI_XA(c4, b42, b02, b12); \ - MOV64(b02, c0); \ - MOV64(b12, c1); \ - MOV64(b22, c2); \ - MOV64(b32, c3); \ - MOV64(b42, c4); \ - NOT64(bnn, b33); \ - KHI_XA(c0, b03, b13, b23); \ - KHI_XO(c1, b13, b23, b33); \ - KHI_XO(c2, b23, bnn, b43); \ - KHI_XA(c3, bnn, b43, b03); \ - KHI_XO(c4, b43, b03, b13); \ - MOV64(b03, c0); \ - MOV64(b13, c1); \ - MOV64(b23, c2); \ - MOV64(b33, c3); \ - MOV64(b43, c4); \ - NOT64(bnn, b14); \ - KHI_XA(c0, b04, bnn, b24); \ - KHI_XO(c1, bnn, b24, b34); \ - KHI_XA(c2, b24, b34, b44); \ - KHI_XO(c3, b34, b44, b04); \ - KHI_XA(c4, b44, b04, b14); \ - MOV64(b04, c0); \ - MOV64(b14, c1); \ - MOV64(b24, c2); \ - MOV64(b34, c3); \ - MOV64(b44, c4); \ - } while (0) - -#define IOTA(r) XOR64_IOTA(a00, a00, r) - -#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \ - a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 -#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \ - a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 -#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \ - a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31 -#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \ - a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13 -#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \ - a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01 -#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \ - a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30 -#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \ - a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33 -#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \ - a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23 -#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \ - a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12 -#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \ - a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21 -#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \ - a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02 -#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \ - a30, a22, a14, a04, 
a41, a33, a20, a12, a02, a44, a31, a23, a10 -#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \ - a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11 -#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \ - a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41 -#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \ - a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24 -#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \ - a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42 -#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \ - a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04 -#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \ - a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20 -#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \ - a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22 -#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \ - a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32 -#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \ - a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43 -#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \ - a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34 -#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \ - a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03 -#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \ - a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40 - -#define P8_TO_P0 do { \ - DECL64(t); \ - MOV64(t, a01); \ - MOV64(a01, a11); \ - MOV64(a11, a43); \ - MOV64(a43, t); \ - MOV64(t, a02); \ - MOV64(a02, a22); \ - MOV64(a22, a31); \ - MOV64(a31, t); \ - MOV64(t, a03); \ - MOV64(a03, a33); \ - MOV64(a33, a24); \ - MOV64(a24, t); \ - MOV64(t, a04); \ - MOV64(a04, a44); \ - MOV64(a44, a12); \ - MOV64(a12, t); \ - MOV64(t, a10); \ - MOV64(a10, a32); \ - MOV64(a32, a13); \ - MOV64(a13, t); \ - MOV64(t, a14); \ - MOV64(a14, a21); \ - MOV64(a21, a20); \ - MOV64(a20, t); \ - MOV64(t, a23); \ - MOV64(a23, a42); \ - MOV64(a42, a40); \ - MOV64(a40, t); \ - MOV64(t, a30); \ - MOV64(a30, a41); \ - MOV64(a41, a34); \ - MOV64(a34, t); \ - } while (0) - -#define LPAR ( -#define RPAR ) - -#define KF_ELT(r, s, k) do { \ - THETA LPAR P ## r RPAR; \ - RHO LPAR P ## r RPAR; \ - KHI LPAR P ## s RPAR; \ - IOTA(k); \ - } while (0) - -#define DO(x) x - -#define KECCAK_F_1600 DO(KECCAK_F_1600_) - -#define KECCAK_F_1600_ do { \ +#define KECCAK_F_1600_256 do { \ int j; \ for (j = 0; j < 24; j += 8) \ { \ - KF_ELT( 0, 1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \ - RC[j + 0], RC[j + 0])) ); \ - KF_ELT( 1, 2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \ - RC[j + 1], RC[j + 1])) ); \ - KF_ELT( 2, 3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \ - RC[j + 2], RC[j + 2])) ); \ - KF_ELT( 3, 4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \ - RC[j + 3], RC[j + 3])) ); \ - KF_ELT( 4, 5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \ - RC[j + 4], RC[j + 4])) ); \ - KF_ELT( 5, 6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \ - RC[j + 5], RC[j + 5])) ); \ - KF_ELT( 6, 7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \ - RC[j + 6], RC[j + 6])) ); \ - KF_ELT( 7, 8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \ - RC[j + 7], RC[j + 7])) ); \ + KF_ELT( 0, 1, _mm256_set1_epi64x( RC[j + 0] ) ); \ + 
KF_ELT( 1, 2, _mm256_set1_epi64x( RC[j + 1] ) ); \ + KF_ELT( 2, 3, _mm256_set1_epi64x( RC[j + 2] ) ); \ + KF_ELT( 3, 4, _mm256_set1_epi64x( RC[j + 3] ) ); \ + KF_ELT( 4, 5, _mm256_set1_epi64x( RC[j + 4] ) ); \ + KF_ELT( 5, 6, _mm256_set1_epi64x( RC[j + 5] ) ); \ + KF_ELT( 6, 7, _mm256_set1_epi64x( RC[j + 6] ) ); \ + KF_ELT( 7, 8, _mm256_set1_epi64x( RC[j + 7] ) ); \ P8_TO_P0; \ } \ } while (0) @@ -453,7 +359,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len, else { j = lim - kc->ptr; - u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb ); + u.tmp[0] = m256_const1_64( eb ); memset_zero_256( u.tmp + 1, (j>>3) - 2 ); u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 ); } @@ -474,7 +380,7 @@ void keccak256_4way_init( void *kc ) } void -keccak256_4way(void *cc, const void *data, size_t len) +keccak256_4way_update(void *cc, const void *data, size_t len) { keccak64_core(cc, data, len, 136); } @@ -491,15 +397,24 @@ void keccak512_4way_init( void *kc ) } void -keccak512_4way(void *cc, const void *data, size_t len) +keccak512_4way_update(void *cc, const void *data, size_t len) { - keccak64_core(cc, data, len, 72); + keccak64_core(cc, data, len, 72); } void keccak512_4way_close(void *cc, void *dst) { - keccak64_close(cc, dst, 64, 72); + keccak64_close(cc, dst, 64, 72); } -#endif +#undef INPUT_BUF +#undef DECL64 +#undef XOR64 +#undef AND64 +#undef OR64 +#undef NOT64 +#undef ROL64 +#undef KECCAK_F_1600 + +#endif // AVX2 diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index 8f6d6a3..d8500a6 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -64,26 +64,49 @@ extern "C"{ * memcpy()). */ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + typedef struct { - __m256i buf[144*8]; /* first field, for alignment */ + __m512i buf[144*8]; + __m512i w[25]; + size_t ptr, lim; +} keccak64_ctx_m512i __attribute__((aligned(128))); + +typedef keccak64_ctx_m512i keccak256_8way_context; +typedef keccak64_ctx_m512i keccak512_8way_context; + +void keccak256_8way_init(void *cc); +void keccak256_8way_update(void *cc, const void *data, size_t len); +void keccak256_8way_close(void *cc, void *dst); + +void keccak512_8way_init(void *cc); +void keccak512_8way_update(void *cc, const void *data, size_t len); +void keccak512_8way_close(void *cc, void *dst); +void keccak512_8way_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +typedef struct { + __m256i buf[144*8]; __m256i w[25]; size_t ptr, lim; -// sph_u64 wide[25]; -} keccak64_ctx_m256i; +} keccak64_ctx_m256i __attribute__((aligned(128))); typedef keccak64_ctx_m256i keccak256_4way_context; typedef keccak64_ctx_m256i keccak512_4way_context; void keccak256_4way_init(void *cc); -void keccak256_4way(void *cc, const void *data, size_t len); +void keccak256_4way_update(void *cc, const void *data, size_t len); void keccak256_4way_close(void *cc, void *dst); - +#define keccak256_4way keccak256_4way_update void keccak512_4way_init(void *cc); -void keccak512_4way(void *cc, const void *data, size_t len); +void keccak512_4way_update(void *cc, const void *data, size_t len); void keccak512_4way_close(void *cc, void *dst); void keccak512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +#define keccak512_4way keccak512_4way_update #endif diff --git a/algo/keccak/keccak-macros.c b/algo/keccak/keccak-macros.c new file mode 100644 index 0000000..64606c3 --- /dev/null +++ b/algo/keccak/keccak-macros.c 
@@ -0,0 +1,324 @@ +#ifdef TH_ELT +#undef TH_ELT +#endif +#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ + DECL64(tt0); \ + DECL64(tt1); \ + DECL64(tt2); \ + DECL64(tt3); \ + XOR64(tt0, d0, d1); \ + XOR64(tt1, d2, d3); \ + XOR64(tt0, tt0, d4); \ + XOR64(tt0, tt0, tt1); \ + ROL64(tt0, tt0, 1); \ + XOR64(tt2, c0, c1); \ + XOR64(tt3, c2, c3); \ + XOR64(tt0, tt0, c4); \ + XOR64(tt2, tt2, tt3); \ + XOR64(t, tt0, tt2); \ + } while (0) + +#ifdef THETA +#undef THETA +#endif +#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(t0); \ + DECL64(t1); \ + DECL64(t2); \ + DECL64(t3); \ + DECL64(t4); \ + TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ + TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ + TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ + TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ + TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ + XOR64(b00, b00, t0); \ + XOR64(b01, b01, t0); \ + XOR64(b02, b02, t0); \ + XOR64(b03, b03, t0); \ + XOR64(b04, b04, t0); \ + XOR64(b10, b10, t1); \ + XOR64(b11, b11, t1); \ + XOR64(b12, b12, t1); \ + XOR64(b13, b13, t1); \ + XOR64(b14, b14, t1); \ + XOR64(b20, b20, t2); \ + XOR64(b21, b21, t2); \ + XOR64(b22, b22, t2); \ + XOR64(b23, b23, t2); \ + XOR64(b24, b24, t2); \ + XOR64(b30, b30, t3); \ + XOR64(b31, b31, t3); \ + XOR64(b32, b32, t3); \ + XOR64(b33, b33, t3); \ + XOR64(b34, b34, t3); \ + XOR64(b40, b40, t4); \ + XOR64(b41, b41, t4); \ + XOR64(b42, b42, t4); \ + XOR64(b43, b43, t4); \ + XOR64(b44, b44, t4); \ + } while (0) + +#ifdef RHO +#undef RHO +#endif +#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + /* ROL64(b00, b00, 0); */ \ + ROL64(b01, b01, 36); \ + ROL64(b02, b02, 3); \ + ROL64(b03, b03, 41); \ + ROL64(b04, b04, 18); \ + ROL64(b10, b10, 1); \ + ROL64(b11, b11, 44); \ + ROL64(b12, b12, 10); \ + ROL64(b13, b13, 45); \ + ROL64(b14, b14, 2); \ + ROL64(b20, b20, 62); \ + ROL64(b21, b21, 6); \ + ROL64(b22, b22, 43); \ + ROL64(b23, b23, 15); \ + ROL64(b24, b24, 61); \ + ROL64(b30, b30, 28); \ + ROL64(b31, b31, 55); \ + ROL64(b32, b32, 25); \ + ROL64(b33, b33, 21); \ + ROL64(b34, b34, 56); \ + ROL64(b40, b40, 27); \ + ROL64(b41, b41, 20); \ + ROL64(b42, b42, 39); \ + ROL64(b43, b43, 8); \ + ROL64(b44, b44, 14); \ + } while (0) + +/* + * The KHI macro integrates the "lane complement" optimization. On input, + * some words are complemented: + * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43 + * On output, the following words are complemented: + * a04 a10 a20 a22 a23 a31 + * + * The (implicit) permutation and the theta expansion will bring back + * the input mask for the next round. 
+ */ + +#ifdef KHI_XO +#undef KHI_XO +#endif +#define KHI_XO(d, a, b, c) do { \ + DECL64(kt); \ + OR64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#ifdef KHI_XA +#undef KHI_XA +#endif +#define KHI_XA(d, a, b, c) do { \ + DECL64(kt); \ + AND64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#ifdef KHI +#undef KHI +#endif +#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(c0); \ + DECL64(c1); \ + DECL64(c2); \ + DECL64(c3); \ + DECL64(c4); \ + DECL64(bnn); \ + NOT64(bnn, b20); \ + KHI_XO(c0, b00, b10, b20); \ + KHI_XO(c1, b10, bnn, b30); \ + KHI_XA(c2, b20, b30, b40); \ + KHI_XO(c3, b30, b40, b00); \ + KHI_XA(c4, b40, b00, b10); \ + MOV64(b00, c0); \ + MOV64(b10, c1); \ + MOV64(b20, c2); \ + MOV64(b30, c3); \ + MOV64(b40, c4); \ + NOT64(bnn, b41); \ + KHI_XO(c0, b01, b11, b21); \ + KHI_XA(c1, b11, b21, b31); \ + KHI_XO(c2, b21, b31, bnn); \ + KHI_XO(c3, b31, b41, b01); \ + KHI_XA(c4, b41, b01, b11); \ + MOV64(b01, c0); \ + MOV64(b11, c1); \ + MOV64(b21, c2); \ + MOV64(b31, c3); \ + MOV64(b41, c4); \ + NOT64(bnn, b32); \ + KHI_XO(c0, b02, b12, b22); \ + KHI_XA(c1, b12, b22, b32); \ + KHI_XA(c2, b22, bnn, b42); \ + KHI_XO(c3, bnn, b42, b02); \ + KHI_XA(c4, b42, b02, b12); \ + MOV64(b02, c0); \ + MOV64(b12, c1); \ + MOV64(b22, c2); \ + MOV64(b32, c3); \ + MOV64(b42, c4); \ + NOT64(bnn, b33); \ + KHI_XA(c0, b03, b13, b23); \ + KHI_XO(c1, b13, b23, b33); \ + KHI_XO(c2, b23, bnn, b43); \ + KHI_XA(c3, bnn, b43, b03); \ + KHI_XO(c4, b43, b03, b13); \ + MOV64(b03, c0); \ + MOV64(b13, c1); \ + MOV64(b23, c2); \ + MOV64(b33, c3); \ + MOV64(b43, c4); \ + NOT64(bnn, b14); \ + KHI_XA(c0, b04, bnn, b24); \ + KHI_XO(c1, bnn, b24, b34); \ + KHI_XA(c2, b24, b34, b44); \ + KHI_XO(c3, b34, b44, b04); \ + KHI_XA(c4, b44, b04, b14); \ + MOV64(b04, c0); \ + MOV64(b14, c1); \ + MOV64(b24, c2); \ + MOV64(b34, c3); \ + MOV64(b44, c4); \ + } while (0) + +#ifdef IOTA +#undef IOTA +#endif +#define IOTA(r) XOR64_IOTA(a00, a00, r) + +#ifdef P0 +#undef P1 +#undef P2 +#undef P3 +#undef P4 +#undef P5 +#undef P6 +#undef P7 +#undef P8 +#undef P9 +#undef P10 +#undef p11 +#undef P12 +#undef P13 +#undef P14 +#undef P15 +#undef P16 +#undef P17 +#undef P18 +#undef P19 +#undef P20 +#undef P21 +#undef P22 +#undef P23 +#endif + +#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \ + a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 +#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \ + a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 +#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \ + a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31 +#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \ + a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13 +#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \ + a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01 +#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \ + a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30 +#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \ + a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33 +#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \ + a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23 +#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \ + a31, a42, 
a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12 +#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \ + a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21 +#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \ + a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02 +#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \ + a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10 +#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \ + a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11 +#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \ + a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41 +#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \ + a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24 +#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \ + a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42 +#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \ + a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04 +#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \ + a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20 +#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \ + a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22 +#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \ + a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32 +#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \ + a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43 +#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \ + a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34 +#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \ + a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03 +#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \ + a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40 + +#ifdef P8_TO_P0 +#undef P8_TO_P0 +#endif +#define P8_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a11); \ + MOV64(a11, a43); \ + MOV64(a43, t); \ + MOV64(t, a02); \ + MOV64(a02, a22); \ + MOV64(a22, a31); \ + MOV64(a31, t); \ + MOV64(t, a03); \ + MOV64(a03, a33); \ + MOV64(a33, a24); \ + MOV64(a24, t); \ + MOV64(t, a04); \ + MOV64(a04, a44); \ + MOV64(a44, a12); \ + MOV64(a12, t); \ + MOV64(t, a10); \ + MOV64(a10, a32); \ + MOV64(a32, a13); \ + MOV64(a13, t); \ + MOV64(t, a14); \ + MOV64(a14, a21); \ + MOV64(a21, a20); \ + MOV64(a20, t); \ + MOV64(t, a23); \ + MOV64(a23, a42); \ + MOV64(a42, a40); \ + MOV64(a40, t); \ + MOV64(t, a30); \ + MOV64(a30, a41); \ + MOV64(a41, a34); \ + MOV64(a34, t); \ + } while (0) + +#define KF_ELT(r, s, k) do { \ + THETA LPAR P ## r RPAR; \ + RHO LPAR P ## r RPAR; \ + KHI LPAR P ## s RPAR; \ + IOTA(k); \ + } while (0) + + diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index cd59afb..467a1a6 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -5,7 +5,6 @@ #include "algo/bmw/bmw-hash-4way.h" #include "algo/cubehash/cubehash_sse2.h" - #if defined (LYRA2REV3_8WAY) typedef struct { @@ -14,7 +13,7 @@ typedef struct { bmw256_8way_context bmw; } lyra2v3_8way_ctx_holder; -static lyra2v3_8way_ctx_holder l2v3_8way_ctx; +static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx; 
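+// Thread-local so each mining thread keeps its own Blake-256 midstate:
+// scanhash_lyra2rev3_8way now absorbs the constant first 64 bytes of the
+// block header once, and lyra2rev3_8way_hash only hashes the final 16
+// bytes that contain the nonce.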
bool init_lyra2rev3_8way_ctx() { @@ -38,7 +37,7 @@ void lyra2rev3_8way_hash( void *state, const void *input ) lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) ); - blake256_8way( &ctx.blake, input, 80 ); + blake256_8way( &ctx.blake, input + (64*8), 16 ); blake256_8way_close( &ctx.blake, vhash ); dintrlv_8x32( hash0, hash1, hash2, hash3, @@ -91,7 +90,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce, { uint32_t hash[8*8] __attribute__ ((aligned (64))); uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[7<<3]); + uint32_t *hash7 = &hash[7<<3]; uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -99,12 +98,15 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce, uint32_t n = first_nonce; const uint32_t Htarg = ptarget[7]; __m256i *noncev = (__m256i*)vdata + 19; // aligned - const int thr_id = mythr->id; // thr_id arg is deprecated + const int thr_id = mythr->id; - if ( opt_benchmark ) - ( (uint32_t*)ptarget )[7] = 0x0000ff; + if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; mm256_bswap32_intrlv80_8x32( vdata, pdata ); + + blake256_8way_init( &l2v3_8way_ctx.blake ); + blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 ); + do { *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, @@ -119,8 +121,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce, extr_lane_8x32( lane_hash, hash, lane, 256 ); if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); } } n += 8; @@ -133,14 +135,14 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce, #if defined (LYRA2REV3_4WAY) - typedef struct { blake256_4way_context blake; cubehashParam cube; bmw256_4way_context bmw; } lyra2v3_4way_ctx_holder; -static lyra2v3_4way_ctx_holder l2v3_4way_ctx; +//static lyra2v3_4way_ctx_holder l2v3_4way_ctx; +static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx; bool init_lyra2rev3_4way_ctx() { @@ -160,7 +162,8 @@ void lyra2rev3_4way_hash( void *state, const void *input ) lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) ); - blake256_4way( &ctx.blake, input, 80 ); +// blake256_4way( &ctx.blake, input, 80 ); + blake256_4way( &ctx.blake, input + (64*4), 16 ); blake256_4way_close( &ctx.blake, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); @@ -206,6 +209,10 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce, ( (uint32_t*)ptarget )[7] = 0x0000ff; mm128_bswap32_intrlv80_4x32( vdata, pdata ); + + blake256_4way_init( &l2v3_4way_ctx.blake ); + blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 ); + do { *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); diff --git a/algo/sha/sha256_hash_11way.c b/algo/sha/sha256_hash_11way.c deleted file mode 100644 index df06375..0000000 --- a/algo/sha/sha256_hash_11way.c +++ /dev/null @@ -1,538 +0,0 @@ -#if 0 - -#include -#include - -#include "sha2-hash-4way.h" - -#if defined(__AVX2__) - -// naming convention for variables and macros -// VARx: AVX2 8 way 32 bit -// VARy: MMX 2 way 32 bit -// VARz: scalar integer 32 bit - - -static const uint32_t H256[8] = -{ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 
0x5BE0CD19 -}; - -static const uint32_t K256[64] = -{ - 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, - 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, - 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, - 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, - 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, - 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, - 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, - 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, - 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, - 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, - 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, - 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, - 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, - 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, - 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, - 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 -}; - -#define CHx(X, Y, Z) \ - _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) - -#define CHy(X, Y, Z) \ - _mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z ) - -#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) ) - - -#define MAJx(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) - -#define MAJy(X, Y, Z) \ - _mm_or_si64( _mm_and_si64( X, Y ), \ - _mm_and_si64( _mm_or_si64( X, Y ), Z ) ) - -#define MAJz(X, Y, Z) ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) ) - -#define BSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x,2), mm256_ror_32(x,13) ), _mm256_srli_epi32(x,22) ) - -#define BSG2_0y(x) \ - _mm_xor_si64( _mm_xor_si64( \ - mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) ) - -#define BSG2_0z(x) ( u32_ror_32(x,2) ^ u32_ror_32(x,13) ^ ((x)>>22) ) - -#define BSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x,6), mm256_ror_32(x,11) ), _mm256_srli_epi32(x,25) ) - -#define BSG2_1y(x) \ - _mm_xor_si64( _mm_xor_si64( \ - mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) ) - -#define BSG2_1z(x) ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) ) - -#define SSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) ) - -#define SSG2_0y(x) \ - _mm_xor_si64( _mm_xor_si64( \ - mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) ) - -#define SSG2_0z(x) (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) ) - -#define SSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) ) - -#define SSG2_1y(x) \ - _mm_xor_si64( _mm_xor_si64( \ - mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) ) - -#define SSG2_1z(x) ( u32_ror_32(x,17) ^ u32_ror_32(x,19) ^ ((x)>>10) ) - -#define SHA2x_MEXP( a, b, c, d ) \ - _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \ - SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] ) - -#define SHA2y_MEXP( a, b, c, d ) \ - _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \ - SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] ) - -#define SHA2z_MEXP( a, b, c, d ) \ - ( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] ) - - -#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \ - Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \ - Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \ -do { \ - __m256i T1x, T2x; \ - __m64 T1y, T2y; \ - uint32_t T1z, T2z; \ - T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \ - _mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \ - _mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \ - T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \ - 
_mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \ - _mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \ - T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \ - T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \ - T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \ - T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \ - Dx = _mm256_add_epi32( Dx, T1x ); \ - Dy = _mm_add_pi32( Dy, T1y ); \ - Dz = Dz + T1z; \ - Hx = _mm256_add_epi32( T1x, T2x ); \ - Hy = _mm_add_pi32( T1y, T2y ); \ - Hz = T1z + T2z; \ -} while (0) - -void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8], - uint32_t *inz, uint32_t rz[8] ) -{ - __m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx; - __m256i Wx[16]; - __m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy; - __m64 Wy[16]; - uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz; - uint32_t Wz[16]; - - Wx[ 0] = mm256_bswap_32( inx[ 0] ); - Wy[ 0] = mm64_bswap_32( iny[ 0] ); - Wz[ 0] = bswap_32( inz[ 0] ); - - Wx[ 1] = mm256_bswap_32( inx[ 1] ); - Wy[ 1] = mm64_bswap_32( iny[ 1] ); - Wz[ 1] = bswap_32( inz[ 1] ); - - Wx[ 2] = mm256_bswap_32( inx[ 2] ); - Wy[ 2] = mm64_bswap_32( iny[ 2] ); - Wz[ 2] = bswap_32( inz[ 2] ); - - Wx[ 3] = mm256_bswap_32( inx[ 3] ); - Wy[ 3] = mm64_bswap_32( iny[ 3] ); - Wz[ 3] = bswap_32( inz[ 3] ); - - Wx[ 4] = mm256_bswap_32( inx[ 4] ); - Wy[ 4] = mm64_bswap_32( iny[ 4] ); - Wz[ 4] = bswap_32( inz[ 4] ); - - Wx[ 5] = mm256_bswap_32( inx[ 5] ); - Wy[ 5] = mm64_bswap_32( iny[ 5] ); - Wz[ 5] = bswap_32( inz[ 5] ); - - Wx[ 6] = mm256_bswap_32( inx[ 6] ); - Wy[ 6] = mm64_bswap_32( iny[ 6] ); - Wz[ 6] = bswap_32( inz[ 6] ); - - Wx[ 7] = mm256_bswap_32( inx[ 7] ); - Wy[ 7] = mm64_bswap_32( iny[ 7] ); - Wz[ 7] = bswap_32( inz[ 7] ); - - Wx[ 8] = mm256_bswap_32( inx[ 8] ); - Wy[ 8] = mm64_bswap_32( iny[ 8] ); - Wz[ 8] = bswap_32( inz[ 8] ); - - Wx[ 9] = mm256_bswap_32( inx[ 9] ); - Wy[ 9] = mm64_bswap_32( iny[ 9] ); - Wz[ 9] = bswap_32( inz[ 9] ); - - Wx[10] = mm256_bswap_32( inx[10] ); - Wy[10] = mm64_bswap_32( iny[10] ); - Wz[10] = bswap_32( inz[10] ); - - Wx[11] = mm256_bswap_32( inx[11] ); - Wy[11] = mm64_bswap_32( iny[11] ); - Wz[11] = bswap_32( inz[11] ); - - Wx[12] = mm256_bswap_32( inx[12] ); - Wy[12] = mm64_bswap_32( iny[12] ); - Wz[12] = bswap_32( inz[12] ); - - Wx[13] = mm256_bswap_32( inx[13] ); - Wy[13] = mm64_bswap_32( iny[13] ); - Wz[13] = bswap_32( inz[13] ); - - Wx[14] = mm256_bswap_32( inx[14] ); - Wy[14] = mm64_bswap_32( iny[14] ); - Wz[14] = bswap_32( inz[14] ); - - Wx[15] = mm256_bswap_32( inx[15] ); - Wy[15] = mm64_bswap_32( iny[15] ); - Wz[15] = bswap_32( inz[15] ); - - Ax = rx[0]; Ay = ry[0]; Az = rz[0]; - Bx = rx[1]; By = ry[1]; Bz = rz[1]; - Cx = rx[2]; Cy = ry[2]; Cz = rz[2]; - Dx = rx[3]; Dy = ry[3]; Dz = rz[3]; - Ex = rx[4]; Ey = ry[4]; Ez = rz[4]; - Fx = rx[5]; Fy = ry[5]; Fz = rz[5]; - Gx = rx[6]; Gy = ry[6]; Gz = rz[6]; - Hx = rx[7]; Hy = ry[7]; Hz = rz[7]; - - SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, - Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, - Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, 0 ); - SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, - Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, - Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, 0 ); - SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, - Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, - Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, 0 ); - SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, - Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, - Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, 0 ); - SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, - Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, - Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, 0 ); - SHA2s_11WAY_STEP( 
Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, - Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, - Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, 0 ); - SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, - Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, - Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, 0 ); - SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, - By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, - Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, 0 ); - SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, - Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, - Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, 0 ); - SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, - Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, - Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, 0 ); - SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, - Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, - Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 ); - SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, - Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, - Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 ); - SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, - Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, - Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 ); - SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, - Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, - Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 ); - SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, - Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, - Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 ); - SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, - By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, - Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - Wx[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - Wy[ 0] = SHA2y_MEXP( 14, 9, 1, 0 ); - Wz[ 0] = SHA2z_MEXP( 14, 9, 1, 0 ); - - Wx[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - Wy[ 1] = SHA2y_MEXP( 15, 10, 2, 1 ); - Wz[ 1] = SHA2z_MEXP( 15, 10, 2, 1 ); - - Wx[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - Wy[ 2] = SHA2y_MEXP( 0, 11, 3, 2 ); - Wz[ 2] = SHA2z_MEXP( 0, 11, 3, 2 ); - - Wx[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - Wy[ 3] = SHA2y_MEXP( 1, 12, 4, 3 ); - Wz[ 3] = SHA2z_MEXP( 1, 12, 4, 3 ); - - Wx[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - Wy[ 4] = SHA2y_MEXP( 2, 13, 5, 4 ); - Wz[ 4] = SHA2z_MEXP( 2, 13, 5, 4 ); - - Wx[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - Wy[ 5] = SHA2y_MEXP( 3, 14, 6, 5 ); - Wz[ 5] = SHA2z_MEXP( 3, 14, 6, 5 ); - - Wx[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - Wy[ 6] = SHA2y_MEXP( 4, 15, 7, 6 ); - Wz[ 6] = SHA2z_MEXP( 4, 15, 7, 6 ); - - Wx[ 7] = SHA2x_MEXP( 5, 0, 8, 7); - Wy[ 7] = SHA2y_MEXP( 5, 0, 8, 7); - Wz[ 7] = SHA2z_MEXP( 5, 0, 8, 7); - - Wx[ 8] = SHA2x_MEXP( 6, 1, 9, 8); - Wy[ 8] = SHA2y_MEXP( 6, 1, 9, 8); - Wz[ 8] = SHA2z_MEXP( 6, 1, 9, 8); - - Wx[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - Wy[ 9] = SHA2y_MEXP( 7, 2, 10, 9); - Wz[ 9] = SHA2z_MEXP( 7, 2, 10, 9); - - Wx[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - Wy[10] = SHA2y_MEXP( 8, 3, 11, 10); - Wz[10] = SHA2z_MEXP( 8, 3, 11, 10); - - Wx[11] = SHA2x_MEXP( 9, 4, 12, 11); - Wy[11] = SHA2y_MEXP( 9, 4, 12, 11); - Wz[11] = SHA2z_MEXP( 9, 4, 12, 11 ); - - Wx[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - Wy[12] = SHA2y_MEXP( 10, 5, 13, 12 ); - Wz[12] = SHA2z_MEXP( 10, 5, 13, 12 ); - - Wx[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - Wy[13] = SHA2y_MEXP( 11, 6, 14, 13 ); - Wz[13] = SHA2z_MEXP( 11, 6, 14, 13 ); - - Wx[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - Wy[14] = SHA2y_MEXP( 12, 7, 15, 14 ); - Wz[14] = SHA2z_MEXP( 12, 7, 15, 14 ); - - Wx[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - Wy[15] = SHA2y_MEXP( 13, 8, 0, 15 ); - Wz[15] = SHA2z_MEXP( 13, 8, 0, 15 ); - - - SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, - Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, - Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, j ); - SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, - Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, - Hz, Az, Bz, Cz, Dz, Ez, Fz, 
Gz, 1, j ); - SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, - Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, - Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, j ); - SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, - Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, - Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, j ); - SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, - Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, - Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, j ); - SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, - Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, - Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, j ); - SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, - Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, - Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, j ); - SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, - By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, - Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, j ); - SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, - Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, - Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, j ); - SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, - Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, - Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, j ); - SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, - Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, - Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j ); - SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, - Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, - Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j ); - SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, - Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, - Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j ); - SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, - Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, - Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j ); - SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, - Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, - Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j ); - SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, - By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, - Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j ); - } - - rx[0] = _mm256_add_epi32( rx[0], Ax ); - ry[0] = _mm_add_pi32( ry[0], Ay ); - rz[0] = rz[0]+ Az; - rx[1] = _mm256_add_epi32( rx[1], Bx ); - ry[1] = _mm_add_pi32( ry[1], By ); - rz[1] = rz[1]+ Bz; - rx[2] = _mm256_add_epi32( rx[2], Cx ); - ry[2] = _mm_add_pi32( ry[2], Cy ); - rz[3] = rz[3]+ Dz; - rx[4] = _mm256_add_epi32( rx[4], Ex ); - ry[4] = _mm_add_pi32( ry[4], Ey ); - rz[4] = rz[4]+ Ez; - rx[5] = _mm256_add_epi32( rx[5], Fx ); - ry[5] = _mm_add_pi32( ry[5], Fy ); - rz[5] = rz[5]+ Fz; - rx[6] = _mm256_add_epi32( rx[6], Gx ); - ry[6] = _mm_add_pi32( ry[6], Gy ); - rz[6] = rz[6]+ Gz; - rx[7] = _mm256_add_epi32( rx[7], Hx ); - ry[7] = _mm_add_pi32( ry[7], Hy ); - rz[7] = rz[7]+ Hz; - -} - -void sha256_11way_init( sha256_11way_context *ctx ) -{ - ctx->count_high = ctx->count_low = 0; - ctx->valx[0] = _mm256_set1_epi32( H256[0] ); - ctx->valy[0] = _mm_set1_pi32( H256[0] ); - ctx->valx[1] = _mm256_set1_epi32( H256[0] ); - ctx->valy[1] = _mm_set1_pi32( H256[0] ); - ctx->valx[2] = _mm256_set1_epi32( H256[0] ); - ctx->valy[2] = _mm_set1_pi32( H256[0] ); - ctx->valx[3] = _mm256_set1_epi32( H256[0] ); - ctx->valy[3] = _mm_set1_pi32( H256[0] ); - ctx->valx[4] = _mm256_set1_epi32( H256[0] ); - ctx->valy[4] = _mm_set1_pi32( H256[0] ); - ctx->valx[5] = _mm256_set1_epi32( H256[0] ); - ctx->valy[5] = _mm_set1_pi32( H256[0] ); - ctx->valx[6] = _mm256_set1_epi32( H256[0] ); - ctx->valy[6] = _mm_set1_pi32( H256[0] ); - ctx->valx[7] = _mm256_set1_epi32( H256[0] ); - ctx->valy[7] = _mm_set1_pi32( H256[0] ); - memcpy( ctx->valz, H256, 32 ); -} - - -void sha256_11way_update( sha256_11way_context *ctx, const void *datax, - const void *datay, const void *dataz, size_t len ) -{ - __m256i *vdatax = (__m256i*) datax; - __m64 *vdatay 
= (__m64*) datay; - uint32_t *idataz = (uint32_t*)dataz; - size_t ptr; - const int buf_size = 64; - - ptr = (unsigned)ctx->count_low & (buf_size - 1U); - while ( len > 0 ) - { - size_t clen; - uint32_t clow, clow2; - - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 ); - memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 ); - memcpy ( ctx->bufz + ptr, idataz + ptr, clen ); - ptr += clen; - len -= clen; - if ( ptr == buf_size ) - { - sha256_11way_round( ctx->bufx, ctx->valx, - ctx->bufy, ctx->valy, - ctx->bufz, ctx->valz ); - ptr = 0; - } - clow = ctx->count_low; - clow2 = clow + clen; - ctx->count_low = clow2; - if ( clow2 < clow ) - ctx->count_high++; - } -} - - -void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty, - void *dstz) -{ - unsigned ptr, u; - uint32_t low, high; - const int buf_size = 64; - const int pad = buf_size - 8; - - ptr = (unsigned)ctx->count_low & (buf_size - 1U); - ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 ); - ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 ); - ctx->bufz[ ptr>>2 ] = 0x80; - ptr += 4; - - if ( ptr > pad ) - { - memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 ); - memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 ); - memset( ctx->bufz + (ptr>>2), 0, (buf_size - ptr) >> 2 ); - sha256_11way_round( ctx->bufx, ctx->valx, - ctx->bufy, ctx->valy, - ctx->bufz, ctx->valz ); - memset_zero_256( ctx->bufx, pad >> 2 ); - memset_zero_m64( ctx->bufy, pad >> 2 ); - memset( ctx->bufz, 0, pad >> 2 ); - } - else - { - memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 ); - memset_zero_m64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 ); - memset( ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 ); - } - - low = ctx->count_low; - high = (ctx->count_high << 3) | (low >> 29); - low = low << 3; - - ctx->bufx[ pad >> 2 ] = - mm256_bswap_32( _mm256_set1_epi32( high ) ); - ctx->bufy[ pad >> 2 ] = - mm64_bswap_32( _mm_set1_pi32( high ) ); - ctx->bufz[ pad >> 2 ] = - bswap_32( high ); - - - ctx->bufx[ ( pad+4 ) >> 2 ] = - mm256_bswap_32( _mm256_set1_epi32( low ) ); - ctx->bufy[ ( pad+4 ) >> 2 ] = - mm64_bswap_32( _mm_set1_pi32( low ) ); - ctx->bufz[ ( pad+4 ) >> 2 ] = - bswap_32( low ); - - sha256_11way_round( ctx->bufx, ctx->valx, - ctx->bufy, ctx->valy, - ctx->bufz, ctx->valz ); - - for ( u = 0; u < 8; u ++ ) - { - casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] ); - casti_m64 ( dsty, u ) = mm64_bswap_32( ctx->valy[u] ); - ((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] ); - } -} - -#endif -#endif // 0 diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 7078d19..5c4dd68 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -5,137 +5,6 @@ #include #include "sha-hash-4way.h" -#if defined(SHA256T_11WAY) - -static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64))); - -void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx, - const void *inpy, const void*inpz ) -{ - uint32_t hashx[8*8] __attribute__ ((aligned (64))); - uint32_t hashy[8*2] __attribute__ ((aligned (64))); - uint32_t hashz[8] __attribute__ ((aligned (64))); - sha256_11way_context ctx; - const void *inpx64 = inpx+(64<<3); - const void *inpy64 = inpy+(64<<1); - const void *inpz64 = inpz+ 64; - - memcpy( &ctx, &sha256_ctx11, sizeof ctx ); - sha256_11way_update( &ctx, inpx64, inpy64, inpz64, 16 ); - sha256_11way_close( &ctx, hashx, hashy, hashz ); - - sha256_11way_init( &ctx ); - sha256_11way_update( &ctx, hashx, hashy, 
hashz, 32 ); - sha256_11way_close( &ctx, hashx, hashy, hashz ); - - sha256_11way_init( &ctx ); - sha256_11way_update( &ctx, hashx, hashy, hashz, 32 ); - sha256_11way_close( &ctx, outx, outy, outz ); -} - -int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t datax[20*8] __attribute__ ((aligned (64))); - uint32_t datay[20*2] __attribute__ ((aligned (32))); - uint32_t dataz[20] __attribute__ ((aligned (32))); - uint32_t hashx[8*8] __attribute__ ((aligned (32))); - uint32_t hashy[8*2] __attribute__ ((aligned (32))); - uint32_t hashz[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - __m256i *noncex = (__m256i*) datax + 19; - __m64 *noncey = (__m64*) datay + 19; - uint32_t *noncez = (uint32_t*)dataz + 19; - int thr_id = mythr->id; // thr_id arg is deprecated - int i; - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - // Use dataz (scalar) to stage bswapped data for the vectors. - casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); - casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); - casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - intrlv_8x32( datax, dataz, dataz, dataz, dataz, - dataz, dataz, dataz, dataz, 640 ); - mm64_interleave_2x32( datay, dataz, dataz, 640 ); - - sha256_11way_init( &sha256_ctx11 ); - sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 ); - - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncex = mm256_bswap_32( - _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) ); - *noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) ); - *noncez = bswap_32( n+10 ); - - pdata[19] = n; - - sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz ); - - if ( opt_benchmark ) { n += 11; continue; } - - hash7 = &(hashx[7<<3]); - for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) ) - { - // deinterleave hash for lane - extr_lane_8x32( lane_hash, hashx, i, 256 ); - if ( fulltest( lane_hash, ptarget ) ) - { - pdata[19] = n + i; - submit_lane_solution( work, lane_hash, mythr, i ); - } - } - - hash7 = &(hashy[7<<1]); - for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) ) - - { - mm64_extr_lane_2x32( lane_hash, hashy, i, 256 ); - if ( fulltest( lane_hash, ptarget ) ) - { - pdata[19] = n + 8 + i; - submit_lane_solution( work, lane_hash, mythr, i+8 ); - } - } - - if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) ) - { - pdata[19] = n+10; - submit_lane_solution( work, hashz, mythr, 10 ); - } - n += 11; - - } while ( (n < max_nonce-12) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; - return 0; -} - -#endif - #if defined(SHA256T_8WAY) static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); diff --git a/algo/simd/nist.c b/algo/simd/nist.c index 73b5131..17b86a6 100644 --- a/algo/simd/nist.c +++ b/algo/simd/nist.c @@ -83,13 +83,14 @@ HashReturn init_sd(hashState_sd *state, int hashbitlen) { char *init; #ifndef NO_PRECOMPUTED_IV - if (hashbitlen == 224) - r=InitIV(state, hashbitlen, IV_224); - else if (hashbitlen == 256) - r=InitIV(state, hashbitlen, IV_256); - 
else if (hashbitlen == 384) - r=InitIV(state, hashbitlen, IV_384); - else if (hashbitlen == 512) +// if (hashbitlen == 224) +// r=InitIV(state, hashbitlen, IV_224); +// else if (hashbitlen == 256) +// r=InitIV(state, hashbitlen, IV_256); +// else if (hashbitlen == 384) +// r=InitIV(state, hashbitlen, IV_384); +// else + if (hashbitlen == 512) r=InitIV(state, hashbitlen, IV_512); else #endif diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index 447cb3a..5dba6d2 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -2,13 +2,136 @@ #include #include #include "skein-hash-4way.h" + +// 8 way is faster than SHA on Icelake +// SHA is faster than 4 way on Ryzen +// #if defined(__SHA__) #include -#else - #include "algo/sha/sha-hash-4way.h" #endif +#include "algo/sha/sha-hash-4way.h" -#if defined (SKEIN_4WAY) +#if defined (SKEIN_8WAY) + +void skeinhash_8way( void *state, const void *input ) +{ + uint64_t vhash64[16*8] __attribute__ ((aligned (128))); + skein512_8way_context ctx_skein; + +//#if defined(__SHA__) +// uint32_t hash0[16] __attribute__ ((aligned (64))); +// uint32_t hash1[16] __attribute__ ((aligned (64))); +// uint32_t hash2[16] __attribute__ ((aligned (64))); +// uint32_t hash3[16] __attribute__ ((aligned (64))); +// uint32_t hash4[16] __attribute__ ((aligned (64))); +// uint32_t hash5[16] __attribute__ ((aligned (64))); +// uint32_t hash6[16] __attribute__ ((aligned (64))); +// uint32_t hash7[16] __attribute__ ((aligned (64))); +// SHA256_CTX ctx_sha256; +//#else + uint32_t vhash32[32*8] __attribute__ ((aligned (128))); + sha256_8way_context ctx_sha256; +//#endif + + skein512_8way_init( &ctx_skein ); + skein512_8way_update( &ctx_skein, input, 80 ); + skein512_8way_close( &ctx_skein, vhash64 ); +/* +#if defined(__SHA__) + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash64, 512 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 ); + SHA256_Final( (unsigned char*)hash0, &ctx_sha256 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 ); + SHA256_Final( (unsigned char*)hash1, &ctx_sha256 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 ); + SHA256_Final( (unsigned char*)hash2, &ctx_sha256 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 ); + SHA256_Final( (unsigned char*)hash3, &ctx_sha256 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 ); + SHA256_Final( (unsigned char*)hash4, &ctx_sha256 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 ); + SHA256_Final( (unsigned char*)hash5, &ctx_sha256 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 ); + SHA256_Final( (unsigned char*)hash6, &ctx_sha256 ); + + SHA256_Init( &ctx_sha256 ); + SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 ); + SHA256_Final( (unsigned char*)hash7, &ctx_sha256 ); + + intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, 256 ); +#else +*/ + + rintrlv_8x64_8x32( vhash32, vhash64, 512 ); +// dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, +// vhash64, 512 ); +// intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6, +// hash7, 512 ); + + sha256_8way_init( &ctx_sha256 ); + sha256_8way( &ctx_sha256, vhash32, 64 ); + sha256_8way_close( &ctx_sha256, state ); +//#endif +} + +int scanhash_skein_8way( struct work *work, 
uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[20*8] __attribute__ ((aligned (128))); + uint32_t hash[16*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); + + skeinhash_8way( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane ] <= Htarg ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( (n < max_nonce-8) && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined (SKEIN_4WAY) void skeinhash_4way( void *state, const void *input ) { @@ -26,7 +149,7 @@ void skeinhash_4way( void *state, const void *input ) #endif skein512_4way_init( &ctx_skein ); - skein512_4way( &ctx_skein, input, 80 ); + skein512_4way_update( &ctx_skein, input, 80 ); skein512_4way_close( &ctx_skein, vhash64 ); #if defined(__SHA__) @@ -71,7 +194,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; mm256_bswap32_intrlv80_4x64( vdata, pdata ); do @@ -92,9 +215,9 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce, } } n += 4; - } while ( (n < max_nonce) && !work_restart[thr_id].restart ); + } while ( (n < max_nonce-4) && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce + 1; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/skein/skein-gate.c b/algo/skein/skein-gate.c index 6acdc19..4e5071b 100644 --- a/algo/skein/skein-gate.c +++ b/algo/skein/skein-gate.c @@ -4,8 +4,11 @@ bool register_skein_algo( algo_gate_t* gate ) { - gate->optimizations = AVX2_OPT | SHA_OPT; -#if defined (SKEIN_4WAY) + gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; +#if defined (SKEIN_8WAY) + gate->scanhash = (void*)&scanhash_skein_8way; + gate->hash = (void*)&skeinhash_8way; +#elif defined (SKEIN_4WAY) gate->scanhash = (void*)&scanhash_skein_4way; gate->hash = (void*)&skeinhash_4way; #else @@ -15,3 +18,20 @@ bool register_skein_algo( algo_gate_t* gate ) return true; }; +bool register_skein2_algo( algo_gate_t* gate ) +{ + gate->optimizations = AVX2_OPT | AVX512_OPT; +#if defined (SKEIN_8WAY) + gate->scanhash = (void*)&scanhash_skein2_8way; + gate->hash = (void*)&skein2hash_8way; +#elif defined (SKEIN_4WAY) + gate->scanhash = (void*)&scanhash_skein2_4way; + gate->hash = (void*)&skein2hash_4way; +#else + gate->scanhash = (void*)&scanhash_skein2; + gate->hash = (void*)&skein2hash; +#endif + return true; +}; + + diff --git a/algo/skein/skein-gate.h b/algo/skein/skein-gate.h index ac7f281..eba535e 100644 --- a/algo/skein/skein-gate.h +++ b/algo/skein/skein-gate.h @@ -1,23 +1,44 @@ #ifndef __SKEIN_GATE_H__ -#define __SKEIN_GATE_H__ +#define __SKEIN_GATE_H__ 1 #include #include "algo-gate-api.h" -#if 
defined(__AVX2__) - #define SKEIN_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define SKEIN_8WAY 1 +#elif defined(__AVX2__) + #define SKEIN_4WAY 1 #endif -#if defined(SKEIN_4WAY) +#if defined(SKEIN_8WAY) + +void skeinhash_8way( void *output, const void *input ); +int scanhash_skein_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +void skein2hash_8way( void *output, const void *input ); +int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, + uint64_t* hashes_done, struct thr_info *mythr ); + +#elif defined(SKEIN_4WAY) void skeinhash_4way( void *output, const void *input ); - int scanhash_skein_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -#endif + +void skein2hash_4way( void *output, const void *input ); +int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, + uint64_t* hashes_done, struct thr_info *mythr ); + +#else void skeinhash( void *output, const void *input ); - int scanhash_skein( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +void skein2hash( void *output, const void *input ); +int scanhash_skein2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + #endif diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index a070ca2..7759d39 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -36,7 +36,6 @@ #include #include "skein-hash-4way.h" - #ifdef __cplusplus extern "C"{ #endif @@ -45,6 +44,22 @@ extern "C"{ #pragma warning (disable: 4146) #endif +/* +static const sph_u64 IV256[] = { + SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), + SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), + SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), + SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) +}; + +static const sph_u64 IV512[] = { + SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), + SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), + SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), + SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) +}; +*/ + /* * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). 
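+ * For instance, M9_16_4 is defined as 2 ((16+4) mod 9), so SKBI(h, 16, 4)
+ * below resolves to h2 in the key schedule.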
*/ @@ -270,8 +285,151 @@ extern "C"{ #define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) #define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) +#define READ_STATE_BIG(sc) do { \ + h0 = (sc)->h0; \ + h1 = (sc)->h1; \ + h2 = (sc)->h2; \ + h3 = (sc)->h3; \ + h4 = (sc)->h4; \ + h5 = (sc)->h5; \ + h6 = (sc)->h6; \ + h7 = (sc)->h7; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->h0 = h0; \ + (sc)->h1 = h1; \ + (sc)->h2 = h2; \ + (sc)->h3 = h3; \ + (sc)->h4 = h4; \ + (sc)->h5 = h5; \ + (sc)->h6 = h6; \ + (sc)->h7 = h7; \ + sc->bcount = bcount; \ + } while (0) + // AVX2 all scalar vars are now vectors representing 4 nonces in parallel + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ +do { \ + k8 = _mm512_xor_si512( _mm512_xor_si512( \ + _mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \ + _mm512_xor_si512( k2, k3 ) ), \ + _mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \ + _mm512_xor_si512( k6, k7 ) ) ), \ + m512_const1_64( 0x1BD11BDAA9FC1A22) ); \ + t2 = t0 ^ t1; \ +} while (0) + +#define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \ +do { \ + w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \ + w1 = _mm512_add_epi64( w1, SKBI(k,s,1) ); \ + w2 = _mm512_add_epi64( w2, SKBI(k,s,2) ); \ + w3 = _mm512_add_epi64( w3, SKBI(k,s,3) ); \ + w4 = _mm512_add_epi64( w4, SKBI(k,s,4) ); \ + w5 = _mm512_add_epi64( w5, _mm512_add_epi64( SKBI(k,s,5), \ + m512_const1_64( SKBT(t,s,0) ) ) ); \ + w6 = _mm512_add_epi64( w6, _mm512_add_epi64( SKBI(k,s,6), \ + m512_const1_64( SKBT(t,s,1) ) ) ); \ + w7 = _mm512_add_epi64( w7, _mm512_add_epi64( SKBI(k,s,7), \ + m512_const1_64( s ) ) ); \ +} while (0) + + +#define TFBIG_MIX_8WAY(x0, x1, rc) \ +do { \ + x0 = _mm512_add_epi64( x0, x1 ); \ + x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 ); \ +} while (0) + +#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ + TFBIG_MIX_8WAY(w0, w1, rc0); \ + TFBIG_MIX_8WAY(w2, w3, rc1); \ + TFBIG_MIX_8WAY(w4, w5, rc2); \ + TFBIG_MIX_8WAY(w6, w7, rc3); \ + } while (0) + +#define TFBIG_8WAY_4e(s) do { \ + TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ + TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ + TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ + TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ + } while (0) + +#define TFBIG_8WAY_4o(s) do { \ + TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ + TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ + TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ + TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ + } while (0) + +#define UBI_BIG_8WAY(etype, extra) \ +do { \ + sph_u64 t0, t1, t2; \ + __m512i h8; \ + __m512i m0 = buf[0]; \ + __m512i m1 = buf[1]; \ + __m512i m2 = buf[2]; \ + __m512i m3 = buf[3]; \ + __m512i m4 = buf[4]; \ + __m512i m5 = buf[5]; \ + __m512i m6 = buf[6]; \ + __m512i m7 = buf[7]; \ +\ + __m512i p0 = m0; \ + __m512i p1 = m1; \ + __m512i p2 = m2; \ + __m512i p3 = m3; \ + __m512i p4 = m4; \ + __m512i p5 = m5; \ + __m512i p6 = m6; \ + __m512i p7 = m7; \ + t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ + t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + TFBIG_KINIT_8WAY(h0, h1, h2, h3, h4, 
h5, h6, h7, h8, t0, t1, t2); \ + TFBIG_8WAY_4e(0); \ + TFBIG_8WAY_4o(1); \ + TFBIG_8WAY_4e(2); \ + TFBIG_8WAY_4o(3); \ + TFBIG_8WAY_4e(4); \ + TFBIG_8WAY_4o(5); \ + TFBIG_8WAY_4e(6); \ + TFBIG_8WAY_4o(7); \ + TFBIG_8WAY_4e(8); \ + TFBIG_8WAY_4o(9); \ + TFBIG_8WAY_4e(10); \ + TFBIG_8WAY_4o(11); \ + TFBIG_8WAY_4e(12); \ + TFBIG_8WAY_4o(13); \ + TFBIG_8WAY_4e(14); \ + TFBIG_8WAY_4o(15); \ + TFBIG_8WAY_4e(16); \ + TFBIG_8WAY_4o(17); \ + TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \ + h0 = _mm512_xor_si512( m0, p0 );\ + h1 = _mm512_xor_si512( m1, p1 );\ + h2 = _mm512_xor_si512( m2, p2 );\ + h3 = _mm512_xor_si512( m3, p3 );\ + h4 = _mm512_xor_si512( m4, p4 );\ + h5 = _mm512_xor_si512( m5, p5 );\ + h6 = _mm512_xor_si512( m6, p6 );\ + h7 = _mm512_xor_si512( m7, p7 );\ +} while (0) + +#define DECL_STATE_BIG_8WAY \ + __m512i h0, h1, h2, h3, h4, h5, h6, h7; \ + sph_u64 bcount; + + +#endif // AVX512 + #define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ do { \ k8 = _mm256_xor_si256( _mm256_xor_si256( \ @@ -298,39 +456,34 @@ do { \ m256_const1_64( s ) ) ); \ } while (0) - #define TFBIG_MIX_4WAY(x0, x1, rc) \ do { \ x0 = _mm256_add_epi64( x0, x1 ); \ x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 ); \ } while (0) - -// typeless -#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ - TFBIG_MIX_4WAY(w0, w1, rc0); \ - TFBIG_MIX_4WAY(w2, w3, rc1); \ - TFBIG_MIX_4WAY(w4, w5, rc2); \ - TFBIG_MIX_4WAY(w6, w7, rc3); \ - } while (0) +#define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ + TFBIG_MIX_4WAY(w0, w1, rc0); \ + TFBIG_MIX_4WAY(w2, w3, rc1); \ + TFBIG_MIX_4WAY(w4, w5, rc2); \ + TFBIG_MIX_4WAY(w6, w7, rc3); \ + } while (0) +#define TFBIG_4WAY_4e(s) do { \ + TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ + TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ + TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ + TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ + } while (0) -#define TFBIG_4e(s) do { \ - TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ - TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ - TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ - TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ - TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ - } while (0) - -#define TFBIG_4o(s) do { \ - TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ - TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ - TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ - TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ - TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ - } while (0) - +#define TFBIG_4WAY_4o(s) do { \ + TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ + TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ + TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ + TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ + } while (0) // scale buf offset by 4 #define UBI_BIG_4WAY(etype, extra) \ @@ -357,24 +510,24 @@ do { \ t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \ - TFBIG_4e(0); \ - TFBIG_4o(1); \ - TFBIG_4e(2); \ - TFBIG_4o(3); 
\ - TFBIG_4e(4); \ - TFBIG_4o(5); \ - TFBIG_4e(6); \ - TFBIG_4o(7); \ - TFBIG_4e(8); \ - TFBIG_4o(9); \ - TFBIG_4e(10); \ - TFBIG_4o(11); \ - TFBIG_4e(12); \ - TFBIG_4o(13); \ - TFBIG_4e(14); \ - TFBIG_4o(15); \ - TFBIG_4e(16); \ - TFBIG_4o(17); \ + TFBIG_4WAY_4e(0); \ + TFBIG_4WAY_4o(1); \ + TFBIG_4WAY_4e(2); \ + TFBIG_4WAY_4o(3); \ + TFBIG_4WAY_4e(4); \ + TFBIG_4WAY_4o(5); \ + TFBIG_4WAY_4e(6); \ + TFBIG_4WAY_4o(7); \ + TFBIG_4WAY_4e(8); \ + TFBIG_4WAY_4o(9); \ + TFBIG_4WAY_4e(10); \ + TFBIG_4WAY_4o(11); \ + TFBIG_4WAY_4e(12); \ + TFBIG_4WAY_4o(13); \ + TFBIG_4WAY_4e(14); \ + TFBIG_4WAY_4o(15); \ + TFBIG_4WAY_4e(16); \ + TFBIG_4WAY_4o(17); \ TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \ h0 = _mm256_xor_si256( m0, p0 );\ h1 = _mm256_xor_si256( m1, p1 );\ @@ -391,45 +544,142 @@ do { \ __m256i h0, h1, h2, h3, h4, h5, h6, h7; \ sph_u64 bcount; -#define READ_STATE_BIG(sc) do { \ - h0 = (sc)->h0; \ - h1 = (sc)->h1; \ - h2 = (sc)->h2; \ - h3 = (sc)->h3; \ - h4 = (sc)->h4; \ - h5 = (sc)->h5; \ - h6 = (sc)->h6; \ - h7 = (sc)->h7; \ - bcount = sc->bcount; \ - } while (0) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -#define WRITE_STATE_BIG(sc) do { \ - (sc)->h0 = h0; \ - (sc)->h1 = h1; \ - (sc)->h2 = h2; \ - (sc)->h3 = h3; \ - (sc)->h4 = h4; \ - (sc)->h5 = h5; \ - (sc)->h6 = h6; \ - (sc)->h7 = h7; \ - sc->bcount = bcount; \ - } while (0) +void skein256_8way_init( skein256_8way_context *sc ) +{ + sc->h0 = m512_const1_64( 0xCCD044A12FDB3E13 ); + sc->h1 = m512_const1_64( 0xE83590301A79A9EB ); + sc->h2 = m512_const1_64( 0x55AEA0614F816E6F ); + sc->h3 = m512_const1_64( 0x2A2767A4AE9B94DB ); + sc->h4 = m512_const1_64( 0xEC06025E74DD7683 ); + sc->h5 = m512_const1_64( 0xE7A436CDC4746251 ); + sc->h6 = m512_const1_64( 0xC36FBAF9393AD185 ); + sc->h7 = m512_const1_64( 0x3EEDBA1833EDFC13 ); + sc->bcount = 0; + sc->ptr = 0; +} -/* -static const sph_u64 IV256[] = { - SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), - SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), - SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), - SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) -}; +void skein512_8way_init( skein512_8way_context *sc ) +{ + sc->h0 = m512_const1_64( 0x4903ADFF749C51CE ); + sc->h1 = m512_const1_64( 0x0D95DE399746DF03 ); + sc->h2 = m512_const1_64( 0x8FD1934127C79BCE ); + sc->h3 = m512_const1_64( 0x9A255629FF352CB1 ); + sc->h4 = m512_const1_64( 0x5DB62599DF6CA7B0 ); + sc->h5 = m512_const1_64( 0xEABE394CA9D5C3F4 ); + sc->h6 = m512_const1_64( 0x991112C71A75B523 ); + sc->h7 = m512_const1_64( 0xAE18A40B660FCC33 ); + sc->bcount = 0; + sc->ptr = 0; +} + +static void +skein_big_core_8way( skein512_8way_context *sc, const void *data, + size_t len ) +{ + __m512i *vdata = (__m512i*)data; + __m512i *buf; + size_t ptr; + unsigned first; + DECL_STATE_BIG_8WAY + + buf = sc->buf; + ptr = sc->ptr; + const int buf_size = 64; // 64 * _m256i + + if ( len <= buf_size - ptr ) + { + memcpy_512( buf + (ptr>>3), vdata, len>>3 ); + sc->ptr = ptr + len; + return; + } + + READ_STATE_BIG( sc ); + first = ( bcount == 0 ) << 7; + do { + size_t clen; + + if ( ptr == buf_size ) + { + bcount ++; + UBI_BIG_8WAY( 96 + first, 0 ); + first = 0; + ptr = 0; + } + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_512( buf + (ptr>>3), vdata, clen>>3 ); + ptr += clen; + vdata += (clen>>3); + len -= clen; + } while ( len > 0 ); + WRITE_STATE_BIG( sc ); + sc->ptr = ptr; +} + +static void +skein_big_close_8way( skein512_8way_context 
*sc, unsigned ub, unsigned n, + void *dst, size_t out_len ) +{ + __m512i *buf; + size_t ptr; + unsigned et; + DECL_STATE_BIG_8WAY + + buf = sc->buf; + ptr = sc->ptr; + const int buf_size = 64; + + READ_STATE_BIG(sc); + + memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + et = 352 + ((bcount == 0) << 7); + UBI_BIG_8WAY( et, ptr ); + + memset_zero_512( buf, buf_size >> 3 ); + bcount = 0; + UBI_BIG_8WAY( 510, 8 ); + + buf[0] = h0; + buf[1] = h1; + buf[2] = h2; + buf[3] = h3; + buf[4] = h4; + buf[5] = h5; + buf[6] = h6; + buf[7] = h7; + + memcpy_512( dst, buf, out_len >> 3 ); +} + +void +skein256_8way_update(void *cc, const void *data, size_t len) +{ + skein_big_core_8way(cc, data, len); +} + +void +skein256_8way_close(void *cc, void *dst) +{ + skein_big_close_8way(cc, 0, 0, dst, 32); +} + +void +skein512_8way_update(void *cc, const void *data, size_t len) +{ + skein_big_core_8way(cc, data, len); +} + +void +skein512_8way_close(void *cc, void *dst) +{ + skein_big_close_8way(cc, 0, 0, dst, 64); +} + +#endif // AVX512 -static const sph_u64 IV512[] = { - SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), - SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), - SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), - SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) -}; -*/ void skein256_4way_init( skein256_4way_context *sc ) { @@ -517,66 +767,30 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n, ptr = sc->ptr; const int buf_size = 64; - /* - * At that point, if ptr == 0, then the message was empty; - * otherwise, there is between 1 and 64 bytes (inclusive) which - * are yet to be processed. Either way, we complete the buffer - * to a full block with zeros (the Skein specification mandates - * that an empty message is padded so that there is at least - * one block to process). - * - * Once this block has been processed, we do it again, with - * a block full of zeros, for the output (that block contains - * the encoding of "0", over 8 bytes, then padded with zeros). 
- */ - READ_STATE_BIG(sc); - memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); et = 352 + ((bcount == 0) << 7); - UBI_BIG_4WAY( et, ptr ); + UBI_BIG_4WAY( et, ptr ); - memset_zero_256( buf, buf_size >> 3 ); - bcount = 0; - UBI_BIG_4WAY( 510, 8 ); + memset_zero_256( buf, buf_size >> 3 ); + bcount = 0; + UBI_BIG_4WAY( 510, 8 ); - buf[0] = h0; - buf[1] = h1; - buf[2] = h2; - buf[3] = h3; - buf[4] = h4; - buf[5] = h5; - buf[6] = h6; - buf[7] = h7; + buf[0] = h0; + buf[1] = h1; + buf[2] = h2; + buf[3] = h3; + buf[4] = h4; + buf[5] = h5; + buf[6] = h6; + buf[7] = h7; - memcpy_256( dst, buf, out_len >> 3 ); + memcpy_256( dst, buf, out_len >> 3 ); } -/* -static const sph_u64 IV256[] = { - SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), - SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), - SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), - SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) -}; - -static const sph_u64 IV512[] = { - SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), - SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), - SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), - SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) -}; -*/ -/* void -skein256_4way_init(void *cc) -{ - skein_big_init_4way(cc, IV256); -} -*/ - -void -skein256_4way(void *cc, const void *data, size_t len) +skein256_4way_update(void *cc, const void *data, size_t len) { skein_big_core_4way(cc, data, len); } @@ -587,16 +801,8 @@ skein256_4way_close(void *cc, void *dst) skein_big_close_4way(cc, 0, 0, dst, 32); } -/* void -skein512_4way_init(void *cc) -{ - skein_big_init_4way(cc, IV512); -} -*/ - -void -skein512_4way(void *cc, const void *data, size_t len) +skein512_4way_update(void *cc, const void *data, size_t len) { skein_big_core_4way(cc, data, len); } diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index 8ff9285..4f828a1 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -55,29 +55,50 @@ extern "C"{ #define SPH_SIZE_skein256 256 #define SPH_SIZE_skein512 512 + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + typedef struct { - __m256i buf[8] __attribute__ ((aligned (64))); + __m512i buf[8]; + __m512i h0, h1, h2, h3, h4, h5, h6, h7; + size_t ptr; + sph_u64 bcount; +} sph_skein_8way_big_context __attribute__ ((aligned (128))); + +typedef sph_skein_8way_big_context skein512_8way_context; +typedef sph_skein_8way_big_context skein256_8way_context; + +void skein512_8way_init( skein512_8way_context *sc ); +void skein512_8way_update( void *cc, const void *data, size_t len ); +void skein512_8way_close( void *cc, void *dst ); + +void skein256_8way_init( skein256_8way_context *sc ); +void skein256_8way_update( void *cc, const void *data, size_t len ); +void skein256_8way_close( void *cc, void *dst ); + +#endif // AVX512 + +typedef struct +{ + __m256i buf[8]; __m256i h0, h1, h2, h3, h4, h5, h6, h7; size_t ptr; sph_u64 bcount; -} sph_skein_4way_big_context; +} sph_skein_4way_big_context __attribute__ ((aligned (128))); typedef sph_skein_4way_big_context skein512_4way_context; typedef sph_skein_4way_big_context skein256_4way_context; void skein512_4way_init( skein512_4way_context *sc ); -void skein512_4way( void *cc, const void *data, size_t len ); +void skein512_4way_update( void *cc, const void *data, size_t len ); void skein512_4way_close( void *cc, void *dst ); -//void 
sph_skein512_addbits_and_close( -// void *cc, unsigned ub, unsigned n, void *dst); +#define skein512_4way skein512_4way_update void skein256_4way_init( skein256_4way_context *sc ); -void skein256_4way( void *cc, const void *data, size_t len ); +void skein256_4way_update( void *cc, const void *data, size_t len ); void skein256_4way_close( void *cc, void *dst ); -//void sph_skein256_addbits_and_close( -// void *cc, unsigned ub, unsigned n, void *dst); - +#define skein256_4way skein256_4way_update #ifdef __cplusplus } diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c index b67fa78..a51508b 100644 --- a/algo/skein/skein2-4way.c +++ b/algo/skein/skein2-4way.c @@ -1,9 +1,66 @@ -#include "skein2-gate.h" +#include "skein-gate.h" #include #include #include "skein-hash-4way.h" -#if defined(SKEIN2_4WAY) +#if defined(SKEIN_8WAY) + +void skein2hash_8way( void *output, const void *input ) +{ + skein512_8way_context ctx; + uint64_t hash[16*8] __attribute__ ((aligned (128))); + + skein512_8way_init( &ctx ); + skein512_8way_update( &ctx, input, 80 ); + skein512_8way_close( &ctx, hash ); + + skein512_8way_init( &ctx ); + skein512_8way_update( &ctx, hash, 64 ); + skein512_8way_close( &ctx, output ); +} + +int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[49]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); + + skein2hash_8way( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane<<1 ] <= Htarg ) + { + extr_lane_8x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( (n < max_nonce-8) && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce + 1; + return 0; +} + +#elif defined(SKEIN_4WAY) void skein2hash_4way( void *output, const void *input ) { diff --git a/algo/skein/skein2-gate.c b/algo/skein/skein2-gate.c deleted file mode 100644 index d40e2c4..0000000 --- a/algo/skein/skein2-gate.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "skein2-gate.h" -#include -#include "sph_skein.h" - -bool register_skein2_algo( algo_gate_t* gate ) -{ - gate->optimizations = AVX2_OPT; -#if defined (SKEIN2_4WAY) - gate->scanhash = (void*)&scanhash_skein2_4way; - gate->hash = (void*)&skein2hash_4way; -#else - gate->scanhash = (void*)&scanhash_skein2; - gate->hash = (void*)&skein2hash; -#endif - return true; -}; - diff --git a/algo/skein/skein2-gate.h b/algo/skein/skein2-gate.h deleted file mode 100644 index 5f3759b..0000000 --- a/algo/skein/skein2-gate.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef __SKEIN2GATE_H__ -#define __SKEIN2_GATE_H__ -#include "algo-gate-api.h" -#include - -#if defined(__AVX2__) - #define SKEIN2_4WAY -#endif - -#if defined(SKEIN2_4WAY) -void skein2hash_4way( void *output, const void *input ); -int scanhash_skein2_4way( struct work *work, uint32_t 
max_nonce, - uint64_t* hashes_done, struct thr_info *mythr ); -#endif - -void skein2hash( void *output, const void *input ); -int scanhash_skein2( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -#endif - diff --git a/algo/skein/skein2.c b/algo/skein/skein2.c index 93f3c07..a42915b 100644 --- a/algo/skein/skein2.c +++ b/algo/skein/skein2.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "skein-gate.h" #include #include diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c index ba3199c..d1f51c5 100644 --- a/algo/x11/timetravel-4way.c +++ b/algo/x11/timetravel-4way.c @@ -45,12 +45,12 @@ void init_tt8_4way_ctx() void timetravel_4way_hash(void *output, const void *input) { - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhashX[8*4] __attribute__ ((aligned (64))); - uint64_t vhashY[8*4] __attribute__ ((aligned (64))); + uint64_t hash0[10] __attribute__ ((aligned (64))); + uint64_t hash1[10] __attribute__ ((aligned (64))); + uint64_t hash2[10] __attribute__ ((aligned (64))); + uint64_t hash3[10] __attribute__ ((aligned (64))); + uint64_t vhashX[10*4] __attribute__ ((aligned (64))); + uint64_t vhashY[10*4] __attribute__ ((aligned (64))); uint64_t *vhashA, *vhashB; tt8_4way_ctx_holder ctx __attribute__ ((aligned (64))); uint32_t dataLen = 64; diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c index 5dab3c8..f4c016d 100644 --- a/algo/x11/timetravel10-4way.c +++ b/algo/x11/timetravel10-4way.c @@ -51,12 +51,12 @@ void init_tt10_4way_ctx() void timetravel10_4way_hash(void *output, const void *input) { - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhashX[8*4] __attribute__ ((aligned (64))); - uint64_t vhashY[8*4] __attribute__ ((aligned (64))); + uint64_t hash0[10] __attribute__ ((aligned (64))); + uint64_t hash1[10] __attribute__ ((aligned (64))); + uint64_t hash2[10] __attribute__ ((aligned (64))); + uint64_t hash3[10] __attribute__ ((aligned (64))); + uint64_t vhashX[10*4] __attribute__ ((aligned (64))); + uint64_t vhashY[10*4] __attribute__ ((aligned (64))); uint64_t *vhashA, *vhashB; tt10_4way_ctx_holder ctx __attribute__ ((aligned (64))); uint32_t dataLen = 64; diff --git a/algo/x12/x12-4way.c b/algo/x12/x12-4way.c index 80fae6d..90ed730 100644 --- a/algo/x12/x12-4way.c +++ b/algo/x12/x12-4way.c @@ -108,7 +108,7 @@ void x12_4way_hash( void *state, const void *input ) intrlv_2x128( vhash, hash2, hash3, 512 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); - intrlv_2x128( hash2, hash3, vhash, 512 ); + dintrlv_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); diff --git a/algo/yespower/yespower-blake2b.c b/algo/yespower/yespower-blake2b.c index 19c2417..9c58b1c 100644 --- a/algo/yespower/yespower-blake2b.c +++ b/algo/yespower/yespower-blake2b.c @@ -49,6 +49,7 @@ * no slowdown from the prefixes is generally observed on AMD CPUs supporting * XOP, some slowdown is sometimes observed on Intel CPUs with AVX. */ +/* #ifdef __XOP__ #warning "Note: XOP is enabled. That's great." 
#elif defined(__AVX__) @@ -60,6 +61,7 @@ #else #warning "Note: building generic code for non-x86. That's OK." #endif +*/ /* * The SSE4 code version has fewer instructions than the generic SSE2 version, diff --git a/build-allarch.sh b/build-allarch.sh index 63e0e95..f6b759d 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -16,7 +16,8 @@ mv cpuminer cpuminer-avx512 make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl +# GCC 9 doesn't include AES with core-avx2 +CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe mv cpuminer.exe cpuminer-avx2.exe @@ -25,7 +26,7 @@ mv cpuminer cpuminer-avx2 make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl +CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure --with-curl make -j 16 strip -s cpuminer.exe mv cpuminer.exe cpuminer-aes-avx.exe diff --git a/build-allarch.sh.bak b/build-allarch.sh.bak new file mode 100755 index 0000000..63e0e95 --- /dev/null +++ b/build-allarch.sh.bak @@ -0,0 +1,86 @@ +#!/bin/bash +# +# This script is not intended for users, it is only used for compile testing +# during development. However, the information contained may provide compilation +# tips to users. + +make distclean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure --with-curl +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-avx512.exe +strip -s cpuminer +mv cpuminer cpuminer-avx512 + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-avx2.exe +strip -s cpuminer +mv cpuminer cpuminer-avx2 + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-aes-avx.exe +strip -s cpuminer +mv cpuminer cpuminer-aes-avx + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-aes-sse42.exe +strip -s cpuminer +mv cpuminer cpuminer-aes-sse42 + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-sse42.exe +strip -s cpuminer +mv cpuminer cpuminer-sse42 + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-ssse3.exe +strip -s cpuminer +mv cpuminer cpuminer-ssse3 + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-sse2.exe +strip -s cpuminer +mv cpuminer cpuminer-sse2 + +make clean || echo done +rm -f config.status +CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-zen.exe +strip -s cpuminer +mv cpuminer cpuminer-zen + +make clean || echo done +rm -f config.status +CFLAGS="-O3 -march=native -Wall" ./configure --with-curl +make -j 16 +strip -s cpuminer.exe +strip -s cpuminer + diff --git a/configure b/configure index 8789e30..496cc57 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.11.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.0. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.9.11' -PACKAGE_STRING='cpuminer-opt 3.9.11' +PACKAGE_VERSION='3.10.0' +PACKAGE_STRING='cpuminer-opt 3.10.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.9.11 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.10.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.9.11:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.10.0:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.9.11 +cpuminer-opt configure 3.10.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.9.11, which was +It was created by cpuminer-opt $as_me 3.10.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.9.11' + VERSION='3.10.0' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.9.11, which was +This file was extended by cpuminer-opt $as_me 3.10.0, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.9.11 +cpuminer-opt config.status 3.10.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 05c55a9..8bb8706 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.9.11]) +AC_INIT([cpuminer-opt], [3.10.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 38626ad..eafe3b5 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -53,6 +53,8 @@ #if HAVE_SYS_PARAM_H #include #endif + +// GCC 9 warning sysctl.h is deprecated #include #endif #endif @@ -3339,12 +3341,14 @@ bool check_cpu_capability () bool cpu_has_avx2 = has_avx2(); bool cpu_has_sha = has_sha(); bool cpu_has_avx512 = has_avx512(); + bool cpu_has_vaes = has_vaes(); bool sw_has_aes = false; bool sw_has_sse42 = false; bool sw_has_avx = false; bool sw_has_avx2 = false; bool sw_has_avx512 = false; bool sw_has_sha = false; + bool sw_has_vaes = false; set_t algo_features = algo_gate.optimizations; bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); bool algo_has_aes = set_incl( AES_OPT, algo_features ); @@ -3352,12 +3356,14 @@ bool check_cpu_capability () bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); bool algo_has_sha = set_incl( SHA_OPT, algo_features ); + bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); bool use_aes; bool use_sse2; bool use_sse42; bool use_avx2; bool use_avx512; bool use_sha; + bool use_vaes; bool use_none; #ifdef __AES__ @@ -3372,12 +3378,16 @@ bool check_cpu_capability () #ifdef __AVX2__ sw_has_avx2 = true; #endif - #if (defined(__AVX512F__) && defined(__AVX51DQF__) && defined(__AVX51BW__) && defined(__AVX512VL__)) + #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) sw_has_avx512 = true; #endif #ifdef __SHA__ sw_has_sha = true; #endif + #ifdef __VAES__ + sw_has_vaes = true; + #endif + // #if !((__AES__) || (__SSE2__)) // printf("Neither __AES__ nor __SSE2__ defined.\n"); @@ -3404,6 +3414,7 @@ bool check_cpu_capability () if ( cpu_has_avx2 ) printf( " AVX2" ); if ( cpu_has_avx512 ) printf( " AVX512" ); if ( cpu_has_sha ) printf( " SHA" ); + if ( cpu_has_vaes ) printf( " VAES" ); printf(".\nSW features: SSE2"); if ( sw_has_aes ) printf( " AES" ); @@ -3412,18 +3423,20 @@ bool check_cpu_capability () if ( sw_has_avx2 ) printf( " AVX2" ); if ( sw_has_avx512 ) printf( " AVX512" ); if ( sw_has_sha ) printf( " SHA" ); + if ( sw_has_vaes ) printf( " VAES" ); printf(".\nAlgo features:"); if ( algo_features == EMPTY_SET ) printf( " None" ); else { - if ( algo_has_sse2 ) printf( " SSE2" ); - if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sse42 ) printf( " SSE4.2" ); + if ( algo_has_sse2 ) printf( " SSE2" ); + if ( algo_has_aes ) printf( " AES" ); + if ( algo_has_sse42 ) printf( " SSE4.2" ); if ( algo_has_avx2 ) printf( " AVX2" ); if ( algo_has_avx512 ) printf( " AVX512" ); if ( algo_has_sha ) printf( " SHA" ); + if ( algo_has_vaes ) printf( " VAES" ); } printf(".\n"); @@ -3461,8 +3474,9 @@ bool check_cpu_capability () use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_sha = cpu_has_sha && sw_has_sha && 
algo_has_sha; + use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes; use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || - use_sha ); + use_sha || use_vaes ); // Display best options printf( "Start mining with" ); diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 210d9eb..fe3f00f 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -575,12 +575,26 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) __m128i s3 = casti_m128i( src,3 ); __m128i s4 = casti_m128i( src,4 ); +#if defined(__SSSE3__) + + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + + s0 = _mm_shuffle_epi8( s0, bswap_shuf ); + s1 = _mm_shuffle_epi8( s1, bswap_shuf ); + s2 = _mm_shuffle_epi8( s2, bswap_shuf ); + s3 = _mm_shuffle_epi8( s3, bswap_shuf ); + s4 = _mm_shuffle_epi8( s4, bswap_shuf ); + +#else + s0 = mm128_bswap_32( s0 ); s1 = mm128_bswap_32( s1 ); s2 = mm128_bswap_32( s2 ); s3 = mm128_bswap_32( s3 ); s4 = mm128_bswap_32( s4 ); +#endif + casti_m128i( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 ); casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 ); casti_m128i( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa ); @@ -742,17 +756,18 @@ static inline void extr_lane_8x32( void *d, const void *s, static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) { - __m128i s0 = casti_m128i( src,0 ); - __m128i s1 = casti_m128i( src,1 ); - __m128i s2 = casti_m128i( src,2 ); - __m128i s3 = casti_m128i( src,3 ); - __m128i s4 = casti_m128i( src,4 ); + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); - s0 = mm128_bswap_32( s0 ); - s1 = mm128_bswap_32( s1 ); - s2 = mm128_bswap_32( s2 ); - s3 = mm128_bswap_32( s3 ); - s4 = mm128_bswap_32( s4 ); + s0 = _mm_shuffle_epi8( s0, bswap_shuf ); + s1 = _mm_shuffle_epi8( s1, bswap_shuf ); + s2 = _mm_shuffle_epi8( s2, bswap_shuf ); + s3 = _mm_shuffle_epi8( s3, bswap_shuf ); + s4 = _mm_shuffle_epi8( s4, bswap_shuf ); casti_m128i( d, 0 ) = casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0 , 0x00 ); @@ -960,17 +975,18 @@ static inline void extr_lane_16x32( void *d, const void *s, static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) { - __m128i s0 = casti_m128i( src,0 ); - __m128i s1 = casti_m128i( src,1 ); - __m128i s2 = casti_m128i( src,2 ); - __m128i s3 = casti_m128i( src,3 ); - __m128i s4 = casti_m128i( src,4 ); + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); - s0 = mm128_bswap_32( s0 ); - s1 = mm128_bswap_32( s1 ); - s2 = mm128_bswap_32( s2 ); - s3 = mm128_bswap_32( s3 ); - s4 = mm128_bswap_32( s4 ); + s0 = _mm_shuffle_epi8( s0, bswap_shuf ); + s1 = _mm_shuffle_epi8( s1, bswap_shuf ); + s2 = _mm_shuffle_epi8( s2, bswap_shuf ); + s3 = _mm_shuffle_epi8( s3, bswap_shuf ); + s4 = _mm_shuffle_epi8( s4, bswap_shuf ); casti_m128i( d, 0 ) = casti_m128i( d, 1 ) = @@ -1374,17 +1390,18 @@ static inline void extr_lane_4x64( void *d, const void *s, static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { - __m128i s0 = casti_m128i( src, 0 ); - __m128i s1 = casti_m128i( src, 1 ); - __m128i s2 = casti_m128i( src, 2 ); - __m128i s3 = casti_m128i( src, 3 ); 
- __m128i s4 = casti_m128i( src, 4 ); + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); - s0 = mm128_bswap_32( s0 ); - s1 = mm128_bswap_32( s1 ); - s2 = mm128_bswap_32( s2 ); - s3 = mm128_bswap_32( s3 ); - s4 = mm128_bswap_32( s4 ); + s0 = _mm_shuffle_epi8( s0, bswap_shuf ); + s1 = _mm_shuffle_epi8( s1, bswap_shuf ); + s2 = _mm_shuffle_epi8( s2, bswap_shuf ); + s3 = _mm_shuffle_epi8( s3, bswap_shuf ); + s4 = _mm_shuffle_epi8( s4, bswap_shuf ); casti_m128i( d, 0 ) = casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 ); @@ -1556,7 +1573,7 @@ static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2, __m128i *d3 = (__m128i*)dst3; __m128i *d4 = (__m128i*)dst4; __m128i *d5 = (__m128i*)dst5; - __m128i *d6 = (__m128i*)dst5; + __m128i *d6 = (__m128i*)dst6; __m128i *d7 = (__m128i*)dst7; const __m128i* s = (const __m128i*)src; @@ -1690,17 +1707,18 @@ static inline void extr_lane_8x64( void *d, const void *s, static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) { - __m128i s0 = casti_m128i( src, 0 ); - __m128i s1 = casti_m128i( src, 1 ); - __m128i s2 = casti_m128i( src, 2 ); - __m128i s3 = casti_m128i( src, 3 ); - __m128i s4 = casti_m128i( src, 4 ); + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + __m128i s0 = casti_m128i( src,0 ); + __m128i s1 = casti_m128i( src,1 ); + __m128i s2 = casti_m128i( src,2 ); + __m128i s3 = casti_m128i( src,3 ); + __m128i s4 = casti_m128i( src,4 ); - s0 = mm128_bswap_32( s0 ); - s1 = mm128_bswap_32( s1 ); - s2 = mm128_bswap_32( s2 ); - s3 = mm128_bswap_32( s3 ); - s4 = mm128_bswap_32( s4 ); + s0 = _mm_shuffle_epi8( s0, bswap_shuf ); + s1 = _mm_shuffle_epi8( s1, bswap_shuf ); + s2 = _mm_shuffle_epi8( s2, bswap_shuf ); + s3 = _mm_shuffle_epi8( s3, bswap_shuf ); + s4 = _mm_shuffle_epi8( s4, bswap_shuf ); casti_m128i( d, 0 ) = casti_m128i( d, 1 ) = @@ -1746,7 +1764,6 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) casti_m128i( d, 37 ) = casti_m128i( d, 38 ) = casti_m128i( d, 39 ) = _mm_shuffle_epi32( s4, 0xee ); - } #endif // AVX512 @@ -1967,6 +1984,68 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src, #undef RLEAVE_4x64_4x32 +#define RLEAVE_8x64_8x32( i ) do \ +{ \ + uint32_t *d = (uint32_t*)dst + (i); \ + const uint32_t *s = (const uint32_t*)src + (i); \ + d[ 0] = s[ 0]; d[ 1] = s[ 2]; d[ 2] = s[ 4]; d[ 3] = s[ 6]; \ + d[ 4] = s[ 8]; d[ 5] = s[10]; d[ 6] = s[12]; d[ 7] = s[14]; \ + d[ 8] = s[ 1]; d[ 9] = s[ 3]; d[10] = s[ 5]; d[11] = s[ 7]; \ + d[12] = s[ 9]; d[13] = s[11]; d[14] = s[13]; d[15] = s[15]; \ +} while(0) + + +// 8x64 -> 8x32 + +static inline void rintrlv_8x64_8x32( void *dst, const void *src, + const int bit_len ) +{ + RLEAVE_8x64_8x32( 0 ); RLEAVE_8x64_8x32( 16 ); + RLEAVE_8x64_8x32( 32 ); RLEAVE_8x64_8x32( 48 ); + RLEAVE_8x64_8x32( 64 ); RLEAVE_8x64_8x32( 80 ); + RLEAVE_8x64_8x32( 96 ); RLEAVE_8x64_8x32( 112 ); + + RLEAVE_8x64_8x32( 128 ); RLEAVE_8x64_8x32( 144 ); + RLEAVE_8x64_8x32( 160 ); RLEAVE_8x64_8x32( 176 ); + RLEAVE_8x64_8x32( 192 ); RLEAVE_8x64_8x32( 208 ); + RLEAVE_8x64_8x32( 224 ); RLEAVE_8x64_8x32( 240 ); + + if ( bit_len <= 256 ) return; + + RLEAVE_8x64_8x32( 256 ); RLEAVE_8x64_8x32( 272 ); + RLEAVE_8x64_8x32( 288 ); RLEAVE_8x64_8x32( 304 ); + RLEAVE_8x64_8x32( 320 ); RLEAVE_8x64_8x32( 336 ); + RLEAVE_8x64_8x32( 352 ); 
RLEAVE_8x64_8x32( 368 ); + + RLEAVE_8x64_8x32( 384 ); RLEAVE_8x64_8x32( 400 ); + RLEAVE_8x64_8x32( 416 ); RLEAVE_8x64_8x32( 432 ); + RLEAVE_8x64_8x32( 448 ); RLEAVE_8x64_8x32( 464 ); + RLEAVE_8x64_8x32( 480 ); RLEAVE_8x64_8x32( 496 ); + + if ( bit_len <= 512 ) return; + + RLEAVE_8x64_8x32( 512 ); RLEAVE_8x64_8x32( 528 ); + RLEAVE_8x64_8x32( 544 ); RLEAVE_8x64_8x32( 560 ); + RLEAVE_8x64_8x32( 576 ); RLEAVE_8x64_8x32( 592 ); + RLEAVE_8x64_8x32( 608 ); RLEAVE_8x64_8x32( 624 ); + + RLEAVE_8x64_8x32( 640 ); RLEAVE_8x64_8x32( 656 ); + RLEAVE_8x64_8x32( 672 ); RLEAVE_8x64_8x32( 688 ); + RLEAVE_8x64_8x32( 704 ); RLEAVE_8x64_8x32( 720 ); + RLEAVE_8x64_8x32( 736 ); RLEAVE_8x64_8x32( 752 ); + + RLEAVE_8x64_8x32( 768 ); RLEAVE_8x64_8x32( 784 ); + RLEAVE_8x64_8x32( 800 ); RLEAVE_8x64_8x32( 816 ); + RLEAVE_8x64_8x32( 832 ); RLEAVE_8x64_8x32( 848 ); + RLEAVE_8x64_8x32( 864 ); RLEAVE_8x64_8x32( 880 ); + + RLEAVE_8x64_8x32( 896 ); RLEAVE_8x64_8x32( 912 ); + RLEAVE_8x64_8x32( 928 ); RLEAVE_8x64_8x32( 944 ); + RLEAVE_8x64_8x32( 960 ); RLEAVE_8x64_8x32( 976 ); + RLEAVE_8x64_8x32( 992 ); RLEAVE_8x64_8x32(1008 ); +} + +#undef RLEAVE_8x64_8x32 // 4x32 -> 4x64 @@ -2067,7 +2146,7 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0, d[13] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] ); d[14] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] ); d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] ); - if ( bit_len <= 256 ) return; + if ( bit_len <= 512 ) return; d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] ); d[17] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] ); d[18] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] ); @@ -2189,15 +2268,15 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, #if defined(__SSE4_1__) // No SSE2 implementation. -#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f ) -#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 ) +//#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f ) +//#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 ) #endif // SSE4_1 #if defined(__AVX2__) -#define mm256_intrlv_blend_128( hi, lo ) _mm256_blend_epi32( hi, lo, 0x0f ) -#define mm256_intrlv_blend_64( hi, lo ) _mm256_blend_epi32( hi, lo, 0x33 ) +//#define mm256_intrlv_blend_128( hi, lo ) _mm256_blend_epi32( hi, lo, 0x0f ) +//#define mm256_intrlv_blend_64( hi, lo ) _mm256_blend_epi32( hi, lo, 0x33 ) #define mm256_intrlv_blend_32( hi, lo ) _mm256_blend_epi32( hi, lo, 0x55 ) // Select lanes of 32 byte hash from 2 sources according to control mask. 
@@ -2216,4 +2295,18 @@ do { \ #endif // AVX2 +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +/* +#define mm512_intrlv_blend_128( hi, lo ) \ + _mm512_mask_blend_epi32( 0x0f0f, hi, lo ) + +#define mm512_intrlv_blend_64( hi, lo ) \ + _mm512_mask_blend_epi32( 0x3333, hi, lo ) +*/ + +#define mm512_intrlv_blend_32( hi, lo ) \ + _mm512_mask_blend_epi32( 0x5555, hi, lo ) + +#endif // AVX512 #endif // INTERLEAVE_H__ diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index c25e892..deec824 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -242,7 +242,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) -/* + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define mm128_ror_64 _mm_ror_epi64 @@ -251,14 +251,14 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_rol_32 _mm_rol_epi32 #else -*/ + #define mm128_ror_64 mm128_ror_var_64 #define mm128_rol_64 mm128_rol_var_64 #define mm128_ror_32 mm128_ror_var_32 #define mm128_rol_32 mm128_rol_var_32 -//#endif // AVX512 else +#endif // AVX512 else #define mm128_ror_16( v, c ) \ _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ) diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index ba0ee17..4e82141 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -233,7 +233,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) _mm256_or_si256( _mm256_slli_epi32( v, c ), \ _mm256_srli_epi32( v, 32-(c) ) ) -/* + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) // AVX512, control must be 8 bit immediate. @@ -244,7 +244,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_rol_32 _mm256_rol_epi32 #else -*/ + // No AVX512, use fallback. @@ -253,7 +253,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_ror_32 mm256_ror_var_32 #define mm256_rol_32 mm256_rol_var_32 -// #endif // AVX512 else +#endif // AVX512 else #define mm256_ror_16( v, c ) \ _mm256_or_si256( _mm256_srli_epi16( v, c ), \ @@ -311,7 +311,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // AVX512 has finer granularity full vector permutes. // AVX512 has full vector alignr which might be faster, especially for 32 bit -/* + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define mm256_swap_128( v ) _mm256_alignr_epi64( v, v, 2 ) @@ -323,7 +323,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_rol_3x32( v ) _mm256_alignr_epi32( v, v, 5 ) #else // AVX2 -*/ // Swap 128 bit elements in 256 bit vector. #define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) @@ -354,7 +353,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) m256_const_64( 0x0000000400000003, 0x0000000200000001, \ 0x0000000000000007, 0x0000000600000005 ) -//#endif // AVX512 else AVX2 +#endif // AVX512 else AVX2 // AVX512 can do 16 & 8 bit elements. 
@@ -423,21 +422,25 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_ror1x32_128( v ) _mm256_shuffle_epi32( v, 0x39 ) #define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 ) -// Rotate each 128 bit lane by one 16 bit element. #define mm256_ror1x16_128( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x01000f0e0d0c0b0a, \ - 0x0908070605040302 ) ) -#define mm256_rol1x16_128( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080706, \ - 0x0504030201000f0e ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x11101f1e1d1c1b1a, 0x1918171615141312, \ + 0x01000f0e0d0c0b0a, 0x0908070605040302 ) ) + +#define mm256_rol1x16_128( v ) \ + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x1d1c1b1a19181716, 0x1514131211101f1e, \ + 0x0d0c0b0a09080706, 0x0504030201000f0e ) ) -// Rotate each 128 bit lane by one byte #define mm256_ror1x8_128( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x000f0e0d0c0b0a09, \ - 0x0807060504030201 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x101f1e1d1c1b1a19, 0x1817161514131211, \ + 0x000f0e0d0c0b0a09, 0x0807060504030201 ) ) + #define mm256_rol1x8_128( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080f0e, \ - 0x0504030201000706 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \ + 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) // Rotate each 128 bit lane by c bytes. #define mm256_bror_128( v, c ) \ @@ -451,50 +454,65 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 ) #define mm256_ror1x16_64( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x09080f0e0d0c0b0a, \ - 0x0100070605040302 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \ + 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) + #define mm256_rol1x16_64( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080f0e, \ - 0x0504030201000706 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \ + 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) #define mm256_ror1x8_64( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x080f0e0d0c0b0a09, \ - 0x0007060504030201 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x181f1e1d1c1b1a19, 0x1017161514131211, \ + 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) + #define mm256_rol1x8_64( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x0e0d0c0b0a09080f, \ - 0x0605040302010007 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x1e1d1c1b1a19181f, 0x1615141312111017, \ + 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) #define mm256_ror3x8_64( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x0a09080f0e0d0c0b, \ - 0x0201000706050403 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x1a19181f1e1d1c1b, 0x1211101716151413, \ + 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) + #define mm256_rol3x8_64( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x0c0b0a09080f0e0d, \ - 0x0403020100070605 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x1c1b1a19181f1e1d, 0x1413121110171615, \ + 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) + // Swap 16 bit elements in each 32 bit lane #define mm256_swap16_32( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x0b0a09080f0e0d0c, \ - 0x0302010007060504 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \ + 0x0b0a09080f0e0d0c, 0x0302010007060504 ) ) // // Swap bytes in vector elements, endian bswap. 
#define mm256_bswap_64( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x08090a0b0c0d0e0f, \ - 0x0001020304050607 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \ + 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) #define mm256_bswap_32( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x0c0d0e0f08090a0b, \ - 0x0405060700010203 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \ + 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) #define mm256_bswap_16( v ) \ - _mm256_shuffle_epi8( v, m256_const2_64( 0x0e0f0c0d0a0b0809, \ - 0x0607040502030001 ) ) + _mm256_shuffle_epi8( v, \ + m256_const_64( 0x1e1f1c1d1a1b1819, 0x1617141512131011, \ + 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) // Source and destination are pointers, may point to same memory. // 8 byte qword * 8 qwords * 4 lanes = 256 bytes #define mm256_block_bswap_64( d, s ) do \ { \ - __m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ + __m256i ctl = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \ + 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \ casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \ casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \ @@ -508,7 +526,8 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // 4 byte dword * 8 dwords * 8 lanes = 256 bytes #define mm256_block_bswap_32( d, s ) do \ { \ - __m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ + __m256i ctl = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \ + 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \ casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 9c06ac8..59c2f28 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -90,7 +90,7 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, // Equivalent of set4, broadcast 256 bits in groups of four 64 bit constants // to all 256 bit lanes: {i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0}. -static inline __m512i mm512_const4_64( const uint64_t i3, const uint64_t i2, +static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2, const uint64_t i1, const uint64_t i0 ) { __m256i lo = mm256_mov64_256( i0 ); @@ -105,7 +105,7 @@ static inline __m512i mm512_const4_64( const uint64_t i3, const uint64_t i2, // Broadcast 128 bits in pairs of 64 bit constants {i1, i0} to all // 128 bit lanes.
-#define mm512_const2_64( i1, i0 ) \ +#define m512_const2_64( i1, i0 ) \ _mm512_permutex_epi64( _mm512_castsi128_si512( \ m128_const_64( i1, i0 ) ), 0x44 ) @@ -132,7 +132,7 @@ static inline __m512i mm512_const4_64( const uint64_t i3, const uint64_t i2, #define m512_one_16 _mm512_broadcastw_epi16( mm128_mov64_128( 1 ) ) #define m512_one_8 _mm512_broadcastb_epi8 ( mm128_mov64_128( 1 ) ) -#define m512_neg1 mm512_const1_64( 0xffffffffffffffff ) +#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) /* // EVEX vcmpeqq returns a bit mask instead of a vector @@ -173,6 +173,19 @@ static inline __m512i mm512_neg1_fn() // returns p+o as pointer to vector #define casto_m512i(p,o) (((__m512i*)(p))+(o)) +// +// Memory functions +// n = number of 512 bit (64 byte) vectors + +static inline void memset_zero_512( __m512i *dst, const int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = m512_zero; } + +static inline void memset_512( __m512i *dst, const __m512i a, const int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = a; } + +static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) +{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } + // Sum 4 values, fewer dependencies than sequential addition. @@ -189,7 +202,7 @@ static inline __m512i mm512_neg1_fn() _mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) ) #define mm512_xor4( a, b, c, d ) \ - _mm512_xor_si512( _mm512_xor_si256( a, b ), _mm512_xor_si256( c, d ) ) + _mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) ) @@ -212,6 +225,11 @@ static inline __m512i mm512_neg1_fn() // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32 // +#define mm512_ror_64 _mm512_ror_epi64 +#define mm512_rol_64 _mm512_rol_epi64 +#define mm512_ror_32 _mm512_ror_epi32 +#define mm512_rol_32 _mm512_rol_epi32 + #define mm512_ror_var_64( v, c ) \ _mm512_or_si512( _mm512_srli_epi64( v, c ), \ _mm512_slli_epi64( v, 64-(c) ) ) @@ -249,22 +267,34 @@ static inline __m512i mm512_neg1_fn() // Swap bytes in vector elements, vectorized endian conversion. #define mm512_bswap_64( v ) \ - _mm512_shuffle_epi8( v, m512_const2_64( \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) + _mm512_shuffle_epi8( v, \ + m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \ + 0x28292a2b2c2d2e2f, 0x2021222324252627, \ + 0x18191a1b1c1d1e1f, 0x1011121314151617, \ + 0x08090a0b0c0d0e0f, 0x0001020304050607 )) #define mm512_bswap_32( v ) \ - _mm512_shuffle_epi8( v, m512_const2_64( \ - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) + _mm512_shuffle_epi8( v, \ + m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \ + 0x2c2d2e2f28292a2b, 0x2425262720212223, \ + 0x0c0d0e0f08090a0b, 0x0405060700010203, \ + 0x1c1d1e1f18191a1b, 0x1415161710111213 ) ) #define mm512_bswap_16( v ) \ - _mm512_shuffle_epi8( v, m512_const2_64( \ - 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) + _mm512_shuffle_epi8( v, \ + m512_const_64( 0x3e3f3c3d3a3b3839, 0x3637343532333031, \ + 0x2e2f2c2d2a2b2829, 0x2627242522232021, \ + 0x1e1f1c1d1a1b1819, 0x1617141512131011, \ + 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) // Source and destination are pointers, may point to same memory. 
// 8 lanes of 64 bytes each #define mm512_block_bswap_64( d, s ) do \ { \ - __m512i ctl = m512_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ + __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \ + 0x28292a2b2c2d2e2f, 0x2021222324252627, \ + 0x18191a1b1c1d1e1f, 0x1011121314151617, \ + 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \ casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \ casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \ @@ -278,7 +308,10 @@ static inline __m512i mm512_neg1_fn() // 16 lanes of 32 bytes each #define mm512_block_bswap_32( d, s ) do \ { \ - __m512i ctl = m512_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ + __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \ + 0x2c2d2e2f28292a2b, 0x2425262720212223, \ + 0x0c0d0e0f08090a0b, 0x0405060700010203, \ + 0x1c1d1e1f18191a1b, 0x1415161710111213 ); \ casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \ casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \ casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \ @@ -381,6 +414,8 @@ static inline __m512i mm512_neg1_fn() #define mm512_ror1x64_256( v ) _mm512_permutex_epi64( v, 0x39 ) #define mm512_rol1x64_256( v ) _mm512_permutex_epi64( v, 0x93 ) + +/* Need to fix // Rotate 256 bit lanes by one 32 bit element #define mm512_ror1x32_256( v ) \ _mm512_permutexvar_epi32( m512_const4_64( \ @@ -411,7 +446,7 @@ static inline __m512i mm512_neg1_fn() _mm512_shuffle_epi8( v, m512_const4_64( \ 0x1e1d1c1b1a191817, 0x161514131211100f, \ 0x0e0d0c0b0a090807, 0x060504030201001f ), v ) - +*/ // // Rotate elements within 128 bit lanes of 512 bit vector. @@ -422,6 +457,7 @@ static inline __m512i mm512_neg1_fn() #define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 ) #define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 ) +/* #define mm512_ror1x16_128( v ) \ _mm512_permutexvar_epi16( m512_const2_64( \ 0x0000000700060005, 0x0004000300020001 ), v ) @@ -437,6 +473,7 @@ static inline __m512i mm512_neg1_fn() #define mm512_rol1x8_128( v ) \ _mm512_shuffle_epi8( v, m512_const2_64( \ 0x0e0d0c0b0a090807, 0x060504030201000f ) ) +*/ // Rotate 128 bit lanes by c bytes. #define mm512_bror_128( v, c ) \ diff --git a/sysinfos.c b/sysinfos.c index 76f9815..704f25c 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -18,14 +18,47 @@ #ifndef WIN32 +// 1035g1: /sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input +// 1035g1: /sys/class/hwmon/hwmon1/temp1_input wrong temp +// ryzen has no /sys/devices/platform/coretemp.0 +// ryzen: /sys/class/hwmon/hwmon0 +// 2400: /sys/class/hwmon/hwmon0/temp1_input incorrect temp +// 2400 has no /sys/class/hwmon/hwmon2/temp1_input +// 2400 /sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input ok +// 6700 /sys/devices/platform/coretemp.0/hwmon/hwmon2/temp1_input +// 6700 /sys/class/hwmon/hwmon2/temp1_input +// /sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input never exists +// /sys/class/hwmon/hwmon0/temp2_input doesn't exist or shows wrong temp (sys16) +// /sys/class/hwmon/hwmon0/device/temp1_input doesn't exist + + +// the first 3 will find i5-2400, i7-6700k, r7-1700, i5-1035g1. +// The others are left in for legacy, some should probably be removed. 
+#define HWMON_PATH1 \ + "/sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input" + +#define HWMON_PATH2 \ + "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input" + +#define HWMON_PATH3 \ + "/sys/class/hwmon/hwmon0/temp1_input" + #define HWMON_PATH \ "/sys/class/hwmon/hwmon2/temp1_input" + +/* #define HWMON_ALT \ "/sys/class/hwmon/hwmon0/temp1_input" + #define HWMON_ALT1 \ "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input" +*/ + +// This shows wrong temp on i5-1035g1 #define HWMON_ALT2 \ "/sys/class/hwmon/hwmon1/temp1_input" + +// None of these work on any of the cpus above. #define HWMON_ALT3 \ "/sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input" #define HWMON_ALT4 \ @@ -33,16 +66,28 @@ #define HWMON_ALT5 \ "/sys/class/hwmon/hwmon0/device/temp1_input" + static inline float linux_cputemp(int core) { float tc = 0.0; - FILE *fd = fopen(HWMON_PATH, "r"); + FILE *fd; uint32_t val = 0; - if (!fd) - fd = fopen(HWMON_ALT, "r"); + fd = fopen(HWMON_PATH1, "r"); + + if (!fd) + fd = fopen(HWMON_PATH2, "r"); + + if (!fd) + fd = fopen(HWMON_PATH3, "r"); + + if (!fd) + fd = fopen(HWMON_PATH, "r"); if (!fd) +// fd = fopen(HWMON_ALT1, "r"); + +// if (!fd) fd = fopen(HWMON_ALT2, "r"); if (!fd) @@ -52,14 +97,14 @@ static inline float linux_cputemp(int core) fd = fopen(HWMON_ALT4, "r"); if (!fd) - fd = fopen(HWMON_ALT5, "r"); + fd = fopen(HWMON_ALT5, "r"); if (!fd) return tc; - if (fscanf(fd, "%d", &val)) + if ( fscanf( fd, "%d", &val ) ) tc = val / 1000.0; - fclose(fd); + fclose( fd ); return tc; } @@ -296,7 +341,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz) // EXTENDED_FEATURES ECX #define AVX512VBMI_Flag (1<<1) #define AVX512VBMI2_Flag (1<<6) -#define AVX512VAES_Flag (1<<9) +#define VAES_Flag (1<<9) // Use this to detect presence of feature @@ -418,14 +463,14 @@ static inline bool has_avx512() #endif } -static inline bool has_avx512vaes() +static inline bool has_vaes() { #ifdef __arm__ return false; #else int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, cpu_info ); - return cpu_info[ ECX_Reg ] & AVX512VAES_Flag; + return cpu_info[ ECX_Reg ] & VAES_Flag; #endif } diff --git a/winbuild-cross.sh b/winbuild-cross.sh index f4585df..a5100cb 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -41,27 +41,22 @@ make -j 16 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-zen.exe -#make clean || echo clean -#CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $CONFIGURE_ARGS -#make -#strip -s cpuminer.exe -#mv cpuminer.exe release/cpuminer-avx-sha.exe +# mingw won't compile avx512 without -fno-asynchronous-unwind-tables +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-asynchronous-unwind-tables" ./configure $CONFIGURE_ARGS +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-avx512.exe make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $CONFIGURE_ARGS +# GCC 9 doesn't include AES in core-avx2 +CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS make -j 16 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2.exe -#make clean || echo clean -#rm -f config.status -#CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS -#make -j -#strip -s cpuminer.exe -#mv cpuminer.exe release/cpuminer-aes-sha.exe - - make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $CONFIGURE_ARGS diff --git a/winbuild-cross.sh.bak b/winbuild-cross.sh.bak new file mode 100755 index 0000000..f4585df --- /dev/null 
+++ b/winbuild-cross.sh.bak @@ -0,0 +1,103 @@ +#!/bin/bash +# +# Script for building Windows binaries release package using mingw. +# Requires a custom mingw environment, not intended for users. +# +# Compiles Windows EXE files for selected CPU architectures, copies them +# as well as some DLLs that aren't available in most Windows environments +# into a release folder ready to be zipped and uploaded. + +# define some local variables + +export LOCAL_LIB="$HOME/usr/lib" + +export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" + +export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32" + +# make link to local gmp header file. +ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h + +# edit configure to fix pthread lib name for Windows. +#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac + +# make release directory and copy selected DLLs. +mkdir release +cp README.txt release/ +cp README.md release/ +cp RELEASE_NOTES release/ +cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/ +cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/ +cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/ +cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/ +cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/ +cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ + +make distclean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-zen.exe + +#make clean || echo clean +#CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $CONFIGURE_ARGS +#make +#strip -s cpuminer.exe +#mv cpuminer.exe release/cpuminer-avx-sha.exe + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $CONFIGURE_ARGS +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-avx2.exe + +#make clean || echo clean +#rm -f config.status +#CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS +#make -j +#strip -s cpuminer.exe +#mv cpuminer.exe release/cpuminer-aes-sha.exe + + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $CONFIGURE_ARGS +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-avx.exe + +# -march=westmere is supported in gcc5 +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS +#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-aes-sse42.exe + +#make clean || echo clean +#rm -f config.status +#CFLAGS="-O3 -march=corei7 -Wall" ./configure $CONFIGURE_ARGS +#make +#strip -s cpuminer.exe +#mv cpuminer.exe release/cpuminer-sse42.exe + +#make clean || echo clean +#rm -f config.status +#CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS +#make +#strip -s cpuminer.exe +#mv cpuminer.exe release/cpuminer-ssse3.exe +#make clean || echo clean + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS +make -j 16 +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-sse2.exe +make clean || echo clean +