From 2cd1507c2e59c592f40be02d723a974644357808 Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Wed, 29 Sep 2021 17:31:16 -0400
Subject: [PATCH] v3.18.0

---
 Makefile.am | 3 +
 RELEASE_NOTES | 31 +
 algo-gate-api.h | 4 +
 algo/argon2/argon2d/blake2/blamka-round-opt.h | 18 +-
 algo/bmw/bmw512-hash-4way.c | 471 +-
 algo/cubehash/cube-hash-2way.c | 246 +-
 algo/cubehash/cube-hash-2way.h | 44 +-
 algo/cubehash/cubehash_sse2.c | 12 +-
 algo/groestl/aes_ni/hash-groestl.h | 3 +-
 algo/groestl/aes_ni/hash-groestl256.h | 3 +-
 algo/groestl/myr-groestl.c | 7 +-
 algo/hamsi/hamsi-hash-4way.c | 218 +-
 algo/hodl/hodl-wolf.c | 1 +
 algo/keccak/keccak-gate.c | 1 +
 algo/keccak/keccak-hash-4way.c | 14 +-
 algo/lyra2/allium-4way.c | 53 +-
 algo/lyra2/sponge.h | 30 +-
 algo/m7m/m7m.c | 24 +-
 algo/ripemd/lbry.c | 27 +-
 algo/scrypt/neoscrypt.c | 8 +-
 algo/scrypt/scrypt-core-4way.c | 3981 +++++++++++++++++
 algo/scrypt/scrypt-core-4way.h | 70 +
 algo/scrypt/scrypt-core-ref.c | 206 +
 algo/scrypt/scrypt.c | 1476 ++++--
 algo/sha/hmac-sha256-hash.c | 50 +-
 algo/sha/hmac-sha256-hash.h | 8 +-
 algo/sha/sha-hash-4way.h | 14 +-
 algo/sha/sha2.c | 8 +-
 algo/sha/sha256-hash-2way-ni.c | 348 +-
 algo/sha/sha256-hash-4way.c | 473 +-
 algo/sha/sha256-hash-opt.c | 192 +-
 algo/sha/sha256-hash-opt.h | 18 -
 algo/sha/sha256-hash.c | 142 +
 algo/sha/sha256-hash.h | 56 +
 algo/sha/sha256d-4way.c | 31 +-
 algo/sha/sha256d.c | 8 +
 algo/sha/sha256d.h | 7 +
 algo/sha/sha256q.c | 30 +-
 algo/sha/sha256t-4way.c | 23 +-
 algo/sha/sha256t.c | 118 +-
 algo/sha/sha512-hash-4way.c | 150 +-
 algo/sha/sph_sha2.c | 210 +-
 algo/sha/sph_sha2.h | 7 +
 algo/shavite/shavite-hash-2way.c | 52 +-
 algo/shavite/shavite-hash-4way.c | 54 +-
 algo/shavite/sph-shavite-aesni.c | 52 +-
 algo/skein/skein-4way.c | 21 +-
 algo/skein/skein.c | 13 +-
 algo/verthash/Verthash.c | 8 +-
 algo/verthash/verthash-gate.c | 4 +-
 algo/whirlpool/whirlpool.c | 2 +-
 algo/x16/x16r-4way.c | 143 +-
 algo/x16/x16r-gate.c | 1 +
 algo/x16/x16r-gate.h | 5 +-
 algo/x16/x21s-4way.c | 22 +-
 algo/x16/x21s.c | 8 +-
 algo/x17/x17-4way.c | 9 +-
 algo/x22/x22i-4way.c | 58 +-
 algo/x22/x22i.c | 6 +-
 algo/x22/x25x-4way.c | 56 +-
 algo/x22/x25x.c | 8 +-
 algo/yespower/crypto/blake2b-yp.c | 8 +-
 algo/yespower/yescrypt-r8g.c | 4 +-
 algo/yespower/yespower-gate.c | 13 +-
 algo/yespower/yespower-opt.c | 19 +-
 algo/yespower/yespower.h | 6 +-
 build-allarch.sh | 2 +-
 configure | 20 +-
 configure.ac | 2 +-
 cpu-miner.c | 136 +-
 miner.h | 37 +-
 simd-utils.h | 2 +
 simd-utils/intrlv.h | 2 +-
 simd-utils/simd-128.h | 177 +-
 simd-utils/simd-256.h | 159 +-
 simd-utils/simd-512.h | 204 +-
 simd-utils/simd-64.h | 10 +-
 simd-utils/simd-int.h | 13 +-
 sysinfos.c | 2 +-
 util.c | 60 +-
 80 files changed, 8145 insertions(+), 2097 deletions(-)
 create mode 100644 algo/scrypt/scrypt-core-4way.c
 create mode 100644 algo/scrypt/scrypt-core-4way.h
 create mode 100644 algo/scrypt/scrypt-core-ref.c
 delete mode 100644 algo/sha/sha256-hash-opt.h
 create mode 100644 algo/sha/sha256-hash.c
 create mode 100644 algo/sha/sha256-hash.h
 create mode 100644 algo/sha/sha256d.c
 create mode 100644 algo/sha/sha256d.h

diff --git a/Makefile.am b/Makefile.am
index a4adc3b..a4163b3 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -158,7 +158,9 @@ cpuminer_SOURCES = \
   algo/ripemd/lbry.c \
   algo/ripemd/lbry-4way.c \
   algo/scrypt/scrypt.c \
+  algo/scrypt/scrypt-core-4way.c \
   algo/scrypt/neoscrypt.c \
+  algo/sha/sha256-hash.c \
   algo/sha/sph_sha2.c \
   algo/sha/sph_sha2big.c \
   algo/sha/sha256-hash-4way.c \
@@ -167,6 +169,7 @@ cpuminer_SOURCES = \
   algo/sha/sha256-hash-2way-ni.c \
   algo/sha/hmac-sha256-hash.c \
   algo/sha/hmac-sha256-hash-4way.c \
+  algo/sha/sha256d.c \
   algo/sha/sha2.c \
   algo/sha/sha256t-gate.c \
   algo/sha/sha256t-4way.c \
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 3f6b080..056491f 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,6 +65,37 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.18.0
+
+Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
+ - AVX512 & SHA support for SHA256, AVX512 has priority,
+ - up to 50% increase in hashrate,
+ - memory requirements reduced 30-60% depending on CPU architecture,
+ - memory usage displayed at startup,
+ - scrypt, default N=1024 (LTC), will likely perform slower.
+
+Improved stale share detection and handling for Scrypt with large N factor:
+ - abort and discard partially computed hash when new work is detected,
+ - quicker response to new job, less time wasted mining stale job.
+
+Improved stale share handling for all algorithms:
+ - report possible stale share when new work received with a previously
+   submitted share still pending,
+ - when new work is detected report the submission of an already completed,
+   otherwise valid, but likely stale, share,
+ - fixed incorrect block height in stale share log.
+
+Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2.
+
+When stratum disconnects miner threads go to idle until reconnected.
+
+Colour changes to some logs.
+
+Some low level function name changes for clarity and consistency.
+
+The reference hashrate in the summary log and the benchmark total hashrate
+are now the mean hashrate for the session.
+
 v3.17.1
 
 Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES.
diff --git a/algo-gate-api.h b/algo-gate-api.h
index 8d61d26..56594d5 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -1,3 +1,6 @@
+#ifndef __ALGO_GATE_API_H__
+#define __ALGO_GATE_API_H__ 1
+
 #include
 #include
 #include
@@ -319,3 +322,4 @@ void exec_hash_function( int algo, void *output, const void *pdata );
 // algo name if valid alias, NULL if invalid alias or algo.
void get_algo_alias( char **algo_or_alias ); +#endif diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 8156331..809961c 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -328,7 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { #include -#define ror64(x, n) _mm512_ror_epi64((x), (n)) +#define ROR64(x, n) _mm512_ror_epi64((x), (n)) static __m512i muladd(__m512i x, __m512i y) { @@ -344,8 +344,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ror64(D0, 32); \ - D1 = ror64(D1, 32); \ + D0 = ROR64(D0, 32); \ + D1 = ROR64(D1, 32); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -353,8 +353,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ror64(B0, 24); \ - B1 = ror64(B1, 24); \ + B0 = ROR64(B0, 24); \ + B1 = ROR64(B1, 24); \ } while ((void)0, 0) #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -365,8 +365,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ror64(D0, 16); \ - D1 = ror64(D1, 16); \ + D0 = ROR64(D0, 16); \ + D1 = ROR64(D1, 16); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -374,8 +374,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ror64(B0, 63); \ - B1 = ror64(B1, 63); \ + B0 = ROR64(B0, 63); \ + B1 = ROR64(B1, 63); \ } while ((void)0, 0) #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index 3587cc4..9ab4f89 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -594,22 +594,15 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) #define rb6(x) mm256_rol_64( x, 43 ) #define rb7(x) mm256_rol_64( x, 53 ) -#define rol_off_64( M, j, off ) \ - mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) +#define rol_off_64( M, j ) \ + mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 ) -#define add_elt_b( M, H, j ) \ - _mm256_xor_si256( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \ - rol_off_64( M, j, 3 ) ), \ - rol_off_64( M, j, 10 ) ), \ - _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) +#define add_elt_b( mj0, mj3, mj10, h, K ) \ + _mm256_xor_si256( h, _mm256_add_epi64( K, \ + _mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) ) - -#define expand1b( qt, M, H, i ) \ - _mm256_add_epi64( mm256_add4_64( \ +#define expand1_b( qt, i ) \ + mm256_add4_64( \ mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \ sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \ mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \ @@ -617,11 +610,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \ sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \ mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \ - sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) + sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ) -#define expand2b( qt, M, H, i) \ - _mm256_add_epi64( mm256_add4_64( \ +#define expand2_b( qt, i) \ + mm256_add4_64( \ mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \ qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \ mm256_add4_64( qt[ (i)-12 ], rb3( qt[ 
(i)-11 ] ), \ @@ -629,159 +621,98 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \ qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \ mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \ - sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) - - + sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ) #define Wb0 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \ + _mm256_add_epi64( mh[13], mh[14] ) ) #define Wb1 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \ + _mm256_sub_epi64( mh[14], mh[15] ) ) #define Wb2 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \ + _mm256_sub_epi64( mh[12], mh[15] ) ) #define Wb3 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \ + _mm256_sub_epi64( mh[10], \ + mh[13] ) ) #define Wb4 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \ + _mm256_add_epi64( mh[11], mh[14] ) ) #define Wb5 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \ + _mm256_sub_epi64( mh[12], mh[15] ) ) #define Wb6 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \ + _mm256_sub_epi64( mh[11], mh[13] ) ) #define Wb7 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \ + _mm256_add_epi64( mh[12], mh[14] ) ) #define Wb8 \ 
_mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[13], mh[15] ) ) #define Wb9 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 7], mh[14] ) ) #define Wb10 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \ + _mm256_sub_epi64( mh[ 7], mh[15] ) ) #define Wb11 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \ + _mm256_sub_epi64( mh[ 5], mh[ 9] ) ) #define Wb12 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \ - _mm256_xor_si256( M[10], H[10] ) ) ) + _mm256_sub_epi64( _mm256_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 9], mh[10] ) ) #define Wb13 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \ - _mm256_xor_si256( M[11], H[11] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \ + _mm256_add_epi64( mh[10], mh[11] ) ) #define Wb14 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[12], H[12] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \ + _mm256_add_epi64( mh[11], mh[12] ) ) #define Wb15 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[ 4], H[4] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 9], mh[13] ) ) void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) { __m256i qt[32], xl, xh; + __m256i mh[16]; + int i; + + for ( i = 0; i < 16; i++ ) + mh[i] = _mm256_xor_si256( M[i], H[i] ); qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] ); qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] ); @@ -799,22 +730,60 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) qt[13] = 
_mm256_add_epi64( sb3( Wb13), H[14] ); qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] ); qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] ); - qt[16] = expand1b( qt, M, H, 16 ); - qt[17] = expand1b( qt, M, H, 17 ); - qt[18] = expand2b( qt, M, H, 18 ); - qt[19] = expand2b( qt, M, H, 19 ); - qt[20] = expand2b( qt, M, H, 20 ); - qt[21] = expand2b( qt, M, H, 21 ); - qt[22] = expand2b( qt, M, H, 22 ); - qt[23] = expand2b( qt, M, H, 23 ); - qt[24] = expand2b( qt, M, H, 24 ); - qt[25] = expand2b( qt, M, H, 25 ); - qt[26] = expand2b( qt, M, H, 26 ); - qt[27] = expand2b( qt, M, H, 27 ); - qt[28] = expand2b( qt, M, H, 28 ); - qt[29] = expand2b( qt, M, H, 29 ); - qt[30] = expand2b( qt, M, H, 30 ); - qt[31] = expand2b( qt, M, H, 31 ); + + __m256i mj[16]; + for ( i = 0; i < 16; i++ ) + mj[i] = rol_off_64( M, i ); + + qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], + (const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) ); + qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], + (const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) ); + qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], + (const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) ); + qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], + (const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) ); + qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], + (const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) ); + qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], + (const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) ); + qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], + (const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) ); + qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], + (const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) ); + qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], + (const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) ); + qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], + (const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) ); + qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], + (const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) ); + qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], + (const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) ); + qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], + (const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) ); + qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], + (const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) ); + qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], + (const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) ); + qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], + (const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) ); + + qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) ); + qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) ); + qt[18] = _mm256_add_epi64( qt[18], expand2_b( qt, 18 ) ); + qt[19] = _mm256_add_epi64( qt[19], expand2_b( qt, 19 ) ); + qt[20] = _mm256_add_epi64( qt[20], expand2_b( qt, 20 ) ); + qt[21] = _mm256_add_epi64( qt[21], expand2_b( qt, 21 ) ); + qt[22] = _mm256_add_epi64( qt[22], expand2_b( qt, 22 ) ); + qt[23] = _mm256_add_epi64( qt[23], expand2_b( qt, 23 ) ); + qt[24] = _mm256_add_epi64( qt[24], expand2_b( qt, 24 ) ); + qt[25] = _mm256_add_epi64( qt[25], expand2_b( qt, 25 ) ); + qt[26] = _mm256_add_epi64( qt[26], expand2_b( qt, 26 ) ); + qt[27] = _mm256_add_epi64( qt[27], expand2_b( qt, 27 ) ); + qt[28] = _mm256_add_epi64( qt[28], expand2_b( qt, 28 ) ); + qt[29] = 
_mm256_add_epi64( qt[29], expand2_b( qt, 29 ) ); + qt[30] = _mm256_add_epi64( qt[30], expand2_b( qt, 30 ) ); + qt[31] = _mm256_add_epi64( qt[31], expand2_b( qt, 31 ) ); xl = _mm256_xor_si256( mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), @@ -823,7 +792,6 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); - #define DH1L( m, sl, sr, a, b, c ) \ _mm256_add_epi64( \ _mm256_xor_si256( M[m], \ @@ -1066,21 +1034,15 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #define r8b6(x) mm512_rol_64( x, 43 ) #define r8b7(x) mm512_rol_64( x, 53 ) -#define rol8w_off_64( M, j, off ) \ - mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) +#define rol8w_off_64( M, j ) \ + mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 ) -#define add_elt_b8( M, H, j ) \ - _mm512_xor_si512( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \ - rol8w_off_64( M, j, 3 ) ), \ - rol8w_off_64( M, j, 10 ) ), \ - _mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) +#define add_elt_b8( mj0, mj3, mj10, h, K ) \ + _mm512_xor_si512( h, _mm512_add_epi64( K, \ + _mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) ) -#define expand1b8( qt, M, H, i ) \ - _mm512_add_epi64( mm512_add4_64( \ +#define expand1_b8( qt, i ) \ + mm512_add4_64( \ mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \ s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \ mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \ @@ -1088,11 +1050,10 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \ s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \ mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \ - s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \ - add_elt_b8( M, H, (i)-16 ) ) + s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ) -#define expand2b8( qt, M, H, i) \ - _mm512_add_epi64( mm512_add4_64( \ +#define expand2_b8( qt, i) \ + mm512_add4_64( \ mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \ qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \ mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \ @@ -1100,157 +1061,97 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \ qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \ mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \ - s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \ - add_elt_b8( M, H, (i)-16 ) ) + s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ) #define W8b0 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_xor_si512( M[10], H[10] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \ + _mm512_add_epi64( mh[13], mh[14] ) ) #define W8b1 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_xor_si512( M[11], H[11] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \ + _mm512_sub_epi64( mh[14], mh[15] ) ) #define W8b2 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( 
_mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \ + _mm512_sub_epi64( mh[12], mh[15] ) ) #define W8b3 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 1], H[ 1] ) ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \ + _mm512_sub_epi64( mh[10], mh[13] ) ) #define W8b4 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \ + _mm512_add_epi64( mh[11], mh[14] ) ) #define W8b5 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_xor_si512( M[10], H[10] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \ + _mm512_sub_epi64( mh[12], mh[15] ) ) #define W8b6 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \ - _mm512_xor_si512( M[ 0], H[ 0] ) ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \ + _mm512_sub_epi64( mh[11], mh[13] ) ) #define W8b7 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \ + _mm512_add_epi64( mh[12], mh[14] ) ) #define W8b8 \ _mm512_add_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[13], mh[15] ) ) #define W8b9 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 7], mh[14] ) ) #define W8b10 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \ - _mm512_xor_si512( M[ 1], H[ 1] ) ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \ + _mm512_sub_epi64( mh[ 7], mh[15] ) ) #define W8b11 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - 
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \ - _mm512_xor_si512( M[ 0], H[ 0] ) ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \ + _mm512_sub_epi64( mh[ 5], mh[ 9] ) ) #define W8b12 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \ - _mm512_xor_si512( M[10], H[10] ) ) ) + _mm512_sub_epi64( _mm512_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 9], mh[10] ) ) #define W8b13 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \ - _mm512_xor_si512( M[11], H[11] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \ + _mm512_add_epi64( mh[10], mh[11] ) ) #define W8b14 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[12], H[12] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \ + _mm512_add_epi64( mh[11], mh[12] ) ) #define W8b15 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[ 4], H[4] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 9], mh[13] ) ) void compress_big_8way( const __m512i *M, const __m512i H[16], __m512i dH[16] ) { __m512i qt[32], xl, xh; + __m512i mh[16]; + int i; + + for ( i = 0; i < 16; i++ ) + mh[i] = _mm512_xor_si512( M[i], H[i] ); qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] ); qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] ); @@ -1268,22 +1169,60 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] ); qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] ); qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] ); - qt[16] = expand1b8( qt, M, H, 16 ); - qt[17] = expand1b8( qt, M, H, 17 ); - qt[18] = expand2b8( qt, M, H, 18 ); - qt[19] = expand2b8( qt, M, H, 19 ); - qt[20] = expand2b8( qt, M, H, 20 ); - qt[21] = expand2b8( qt, M, H, 21 ); - qt[22] = expand2b8( qt, M, H, 22 ); - qt[23] = expand2b8( qt, M, H, 23 ); - qt[24] = expand2b8( qt, M, H, 24 ); - qt[25] = expand2b8( qt, M, H, 25 ); - qt[26] = expand2b8( qt, M, H, 26 ); - qt[27] = expand2b8( qt, M, H, 27 ); - qt[28] = expand2b8( qt, M, H, 28 ); - qt[29] = expand2b8( qt, M, H, 29 ); - qt[30] = expand2b8( qt, M, H, 30 ); - qt[31] = expand2b8( qt, M, H, 31 ); + + __m512i mj[16]; + for ( i = 0; i < 16; i++ ) + mj[i] = rol8w_off_64( M, i ); + + qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], + (const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) ); + qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8], + (const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) ); + qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9], + (const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) ); + qt[19] = add_elt_b8( mj[ 3], mj[ 
6], mj[13], H[10], + (const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) ); + qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11], + (const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) ); + qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12], + (const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) ); + qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13], + (const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) ); + qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14], + (const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) ); + qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15], + (const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) ); + qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0], + (const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) ); + qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1], + (const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) ); + qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2], + (const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) ); + qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3], + (const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) ); + qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4], + (const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) ); + qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5], + (const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) ); + qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6], + (const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) ); + + qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) ); + qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) ); + qt[18] = _mm512_add_epi64( qt[18], expand2_b8( qt, 18 ) ); + qt[19] = _mm512_add_epi64( qt[19], expand2_b8( qt, 19 ) ); + qt[20] = _mm512_add_epi64( qt[20], expand2_b8( qt, 20 ) ); + qt[21] = _mm512_add_epi64( qt[21], expand2_b8( qt, 21 ) ); + qt[22] = _mm512_add_epi64( qt[22], expand2_b8( qt, 22 ) ); + qt[23] = _mm512_add_epi64( qt[23], expand2_b8( qt, 23 ) ); + qt[24] = _mm512_add_epi64( qt[24], expand2_b8( qt, 24 ) ); + qt[25] = _mm512_add_epi64( qt[25], expand2_b8( qt, 25 ) ); + qt[26] = _mm512_add_epi64( qt[26], expand2_b8( qt, 26 ) ); + qt[27] = _mm512_add_epi64( qt[27], expand2_b8( qt, 27 ) ); + qt[28] = _mm512_add_epi64( qt[28], expand2_b8( qt, 28 ) ); + qt[29] = _mm512_add_epi64( qt[29], expand2_b8( qt, 29 ) ); + qt[30] = _mm512_add_epi64( qt[30], expand2_b8( qt, 30 ) ); + qt[31] = _mm512_add_epi64( qt[31], expand2_b8( qt, 31 ) ); xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ), mm512_xor3( qt[19], qt[20], qt[21] ), diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index 1201b8f..06f7e09 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -98,6 +98,138 @@ static void transform_4way( cube_4way_context *sp ) _mm512_store_si512( (__m512i*)sp->h + 7, x7 ); } +// 8 ways, 4 way parallel double buffered +static void transform_4way_2buf( cube_4way_2buf_context *sp ) +{ + int r; + const int rounds = sp->rounds; + + __m512i x0, x1, x2, x3, x4, x5, x6, x7; + __m512i y0, y1, y2, y3, y4, y5, y6, y7; + __m512i tx0, tx1, ty0, ty1; + + x0 = _mm512_load_si512( (__m512i*)sp->h0 ); + x1 = _mm512_load_si512( (__m512i*)sp->h0 + 1 ); + x2 = _mm512_load_si512( (__m512i*)sp->h0 + 2 ); + x3 = _mm512_load_si512( (__m512i*)sp->h0 + 3 ); + x4 = _mm512_load_si512( (__m512i*)sp->h0 + 4 ); + x5 = _mm512_load_si512( (__m512i*)sp->h0 + 5 ); + x6 = _mm512_load_si512( (__m512i*)sp->h0 + 6 ); + x7 = 
_mm512_load_si512( (__m512i*)sp->h0 + 7 ); + + y0 = _mm512_load_si512( (__m512i*)sp->h1 ); + y1 = _mm512_load_si512( (__m512i*)sp->h1 + 1 ); + y2 = _mm512_load_si512( (__m512i*)sp->h1 + 2 ); + y3 = _mm512_load_si512( (__m512i*)sp->h1 + 3 ); + y4 = _mm512_load_si512( (__m512i*)sp->h1 + 4 ); + y5 = _mm512_load_si512( (__m512i*)sp->h1 + 5 ); + y6 = _mm512_load_si512( (__m512i*)sp->h1 + 6 ); + y7 = _mm512_load_si512( (__m512i*)sp->h1 + 7 ); + + + for ( r = 0; r < rounds; ++r ) + { + x4 = _mm512_add_epi32( x0, x4 ); + y4 = _mm512_add_epi32( y0, y4 ); + tx0 = x0; + ty0 = y0; + x5 = _mm512_add_epi32( x1, x5 ); + y5 = _mm512_add_epi32( y1, y5 ); + tx1 = x1; + ty1 = y1; + x0 = mm512_rol_32( x2, 7 ); + y0 = mm512_rol_32( y2, 7 ); + x6 = _mm512_add_epi32( x2, x6 ); + y6 = _mm512_add_epi32( y2, y6 ); + x1 = mm512_rol_32( x3, 7 ); + y1 = mm512_rol_32( y3, 7 ); + x7 = _mm512_add_epi32( x3, x7 ); + y7 = _mm512_add_epi32( y3, y7 ); + + + x2 = mm512_rol_32( tx0, 7 ); + y2 = mm512_rol_32( ty0, 7 ); + x0 = _mm512_xor_si512( x0, x4 ); + y0 = _mm512_xor_si512( y0, y4 ); + x4 = mm512_swap128_64( x4 ); + x3 = mm512_rol_32( tx1, 7 ); + y3 = mm512_rol_32( ty1, 7 ); + y4 = mm512_swap128_64( y4 ); + + x1 = _mm512_xor_si512( x1, x5 ); + y1 = _mm512_xor_si512( y1, y5 ); + x5 = mm512_swap128_64( x5 ); + x2 = _mm512_xor_si512( x2, x6 ); + y2 = _mm512_xor_si512( y2, y6 ); + y5 = mm512_swap128_64( y5 ); + x3 = _mm512_xor_si512( x3, x7 ); + y3 = _mm512_xor_si512( y3, y7 ); + + x6 = mm512_swap128_64( x6 ); + x4 = _mm512_add_epi32( x0, x4 ); + y4 = _mm512_add_epi32( y0, y4 ); + y6 = mm512_swap128_64( y6 ); + x5 = _mm512_add_epi32( x1, x5 ); + y5 = _mm512_add_epi32( y1, y5 ); + x7 = mm512_swap128_64( x7 ); + x6 = _mm512_add_epi32( x2, x6 ); + y6 = _mm512_add_epi32( y2, y6 ); + tx0 = x0; + ty0 = y0; + y7 = mm512_swap128_64( y7 ); + tx1 = x2; + ty1 = y2; + x0 = mm512_rol_32( x1, 11 ); + y0 = mm512_rol_32( y1, 11 ); + + x7 = _mm512_add_epi32( x3, x7 ); + y7 = _mm512_add_epi32( y3, y7 ); + + x1 = mm512_rol_32( tx0, 11 ); + y1 = mm512_rol_32( ty0, 11 ); + x0 = _mm512_xor_si512( x0, x4 ); + x4 = mm512_swap64_32( x4 ); + y0 = _mm512_xor_si512( y0, y4 ); + x2 = mm512_rol_32( x3, 11 ); + y4 = mm512_swap64_32( y4 ); + y2 = mm512_rol_32( y3, 11 ); + x1 = _mm512_xor_si512( x1, x5 ); + x5 = mm512_swap64_32( x5 ); + y1 = _mm512_xor_si512( y1, y5 ); + x3 = mm512_rol_32( tx1, 11 ); + y5 = mm512_swap64_32( y5 ); + y3 = mm512_rol_32( ty1, 11 ); + + x2 = _mm512_xor_si512( x2, x6 ); + x6 = mm512_swap64_32( x6 ); + y2 = _mm512_xor_si512( y2, y6 ); + y6 = mm512_swap64_32( y6 ); + x3 = _mm512_xor_si512( x3, x7 ); + x7 = mm512_swap64_32( x7 ); + y3 = _mm512_xor_si512( y3, y7 ); + + y7 = mm512_swap64_32( y7 ); + } + + _mm512_store_si512( (__m512i*)sp->h0, x0 ); + _mm512_store_si512( (__m512i*)sp->h0 + 1, x1 ); + _mm512_store_si512( (__m512i*)sp->h0 + 2, x2 ); + _mm512_store_si512( (__m512i*)sp->h0 + 3, x3 ); + _mm512_store_si512( (__m512i*)sp->h0 + 4, x4 ); + _mm512_store_si512( (__m512i*)sp->h0 + 5, x5 ); + _mm512_store_si512( (__m512i*)sp->h0 + 6, x6 ); + _mm512_store_si512( (__m512i*)sp->h0 + 7, x7 ); + + _mm512_store_si512( (__m512i*)sp->h1, y0 ); + _mm512_store_si512( (__m512i*)sp->h1 + 1, y1 ); + _mm512_store_si512( (__m512i*)sp->h1 + 2, y2 ); + _mm512_store_si512( (__m512i*)sp->h1 + 3, y3 ); + _mm512_store_si512( (__m512i*)sp->h1 + 4, y4 ); + _mm512_store_si512( (__m512i*)sp->h1 + 5, y5 ); + _mm512_store_si512( (__m512i*)sp->h1 + 6, y6 ); + _mm512_store_si512( (__m512i*)sp->h1 + 7, y7 ); +} + int cube_4way_init( cube_4way_context *sp, int 
hashbitlen, int rounds, int blockbytes ) { @@ -219,6 +351,67 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, return 0; } +int cube_4way_2buf_full( cube_4way_2buf_context *sp, + void *output0, void *output1, int hashbitlen, + const void *data0, const void *data1, size_t size ) +{ + __m512i *h0 = (__m512i*)sp->h0; + __m512i *h1 = (__m512i*)sp->h1; + __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512 + : (__m128i*)IV256 ); + sp->hashlen = hashbitlen/128; + sp->blocksize = 32/16; + sp->rounds = 16; + sp->pos = 0; + + h1[0] = h0[0] = m512_const1_128( iv[0] ); + h1[1] = h0[1] = m512_const1_128( iv[1] ); + h1[2] = h0[2] = m512_const1_128( iv[2] ); + h1[3] = h0[3] = m512_const1_128( iv[3] ); + h1[4] = h0[4] = m512_const1_128( iv[4] ); + h1[5] = h0[5] = m512_const1_128( iv[5] ); + h1[6] = h0[6] = m512_const1_128( iv[6] ); + h1[7] = h0[7] = m512_const1_128( iv[7] ); + + const int len = size >> 4; + const __m512i *in0 = (__m512i*)data0; + const __m512i *in1 = (__m512i*)data1; + __m512i *hash0 = (__m512i*)output0; + __m512i *hash1 = (__m512i*)output1; + int i; + + for ( i = 0; i < len; i++ ) + { + sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], in0[i] ); + sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], in1[i] ); + sp->pos++; + if ( sp->pos == sp->blocksize ) + { + transform_4way_2buf( sp ); + sp->pos = 0; + } + } + + // pos is zero for 64 byte data, 1 for 80 byte data. + __m512i tmp = m512_const2_64( 0, 0x0000000000000080 ); + sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp ); + sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp ); + + transform_4way_2buf( sp ); + + tmp = m512_const2_64( 0x0000000100000000, 0 ); + sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp ); + sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp ); + + for ( i = 0; i < 10; ++i ) + transform_4way_2buf( sp ); + + memcpy( hash0, sp->h0, sp->hashlen<<6); + memcpy( hash1, sp->h1, sp->hashlen<<6); + + return 0; +} + int cube_4way_update_close( cube_4way_context *sp, void *output, const void *data, size_t size ) @@ -259,6 +452,21 @@ int cube_4way_update_close( cube_4way_context *sp, void *output, // 2 way 128 +// This isn't expected to be used with AVX512 so HW rotate intruction +// is assumed not avaiable. +// Use double buffering to optimize serial bit rotations. Full double +// buffering isn't practical because it needs twice as many registers +// with AVX2 having only half as many as AVX512. 
+#define ROL2( out0, out1, in0, in1, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi32( in0, c ); \ + __m256i t1 = _mm256_slli_epi32( in1, c ); \ + out0 = _mm256_srli_epi32( in0, 32-(c) ); \ + out1 = _mm256_srli_epi32( in1, 32-(c) ); \ + out0 = _mm256_or_si256( out0, t0 ); \ + out1 = _mm256_or_si256( out1, t1 ); \ +} + static void transform_2way( cube_2way_context *sp ) { int r; @@ -283,35 +491,31 @@ static void transform_2way( cube_2way_context *sp ) x7 = _mm256_add_epi32( x3, x7 ); y0 = x0; y1 = x1; - x0 = mm256_rol_32( x2, 7 ); - x1 = mm256_rol_32( x3, 7 ); - x2 = mm256_rol_32( y0, 7 ); - x3 = mm256_rol_32( y1, 7 ); + ROL2( x0, x1, x2, x3, 7 ); + ROL2( x2, x3, y0, y1, 7 ); x0 = _mm256_xor_si256( x0, x4 ); - x1 = _mm256_xor_si256( x1, x5 ); - x2 = _mm256_xor_si256( x2, x6 ); - x3 = _mm256_xor_si256( x3, x7 ); x4 = mm256_swap128_64( x4 ); - x5 = mm256_swap128_64( x5 ); - x6 = mm256_swap128_64( x6 ); - x7 = mm256_swap128_64( x7 ); - x4 = _mm256_add_epi32( x0, x4 ); - x5 = _mm256_add_epi32( x1, x5 ); - x6 = _mm256_add_epi32( x2, x6 ); - x7 = _mm256_add_epi32( x3, x7 ); - y0 = x0; - y1 = x2; - x0 = mm256_rol_32( x1, 11 ); - x1 = mm256_rol_32( y0, 11 ); - x2 = mm256_rol_32( x3, 11 ); - x3 = mm256_rol_32( y1, 11 ); - x0 = _mm256_xor_si256( x0, x4 ); x1 = _mm256_xor_si256( x1, x5 ); x2 = _mm256_xor_si256( x2, x6 ); + x5 = mm256_swap128_64( x5 ); x3 = _mm256_xor_si256( x3, x7 ); + x4 = _mm256_add_epi32( x0, x4 ); + x6 = mm256_swap128_64( x6 ); + y0 = x0; + x5 = _mm256_add_epi32( x1, x5 ); + x7 = mm256_swap128_64( x7 ); + x6 = _mm256_add_epi32( x2, x6 ); + y1 = x2; + ROL2( x0, x1, x1, y0, 11 ); + x7 = _mm256_add_epi32( x3, x7 ); + ROL2( x2, x3, x3, y1, 11 ); + x0 = _mm256_xor_si256( x0, x4 ); x4 = mm256_swap64_32( x4 ); + x1 = _mm256_xor_si256( x1, x5 ); x5 = mm256_swap64_32( x5 ); + x2 = _mm256_xor_si256( x2, x6 ); x6 = mm256_swap64_32( x6 ); + x3 = _mm256_xor_si256( x3, x7 ); x7 = mm256_swap64_32( x7 ); } diff --git a/algo/cubehash/cube-hash-2way.h b/algo/cubehash/cube-hash-2way.h index 25df10e..a31ffde 100644 --- a/algo/cubehash/cube-hash-2way.h +++ b/algo/cubehash/cube-hash-2way.h @@ -17,41 +17,41 @@ struct _cube_4way_context int pos; } __attribute__ ((aligned (128))); +struct _cube_4way_2buf_context +{ + __m512i h0[8]; + __m512i h1[8]; + int hashlen; + int rounds; + int blocksize; + int pos; +} __attribute__ ((aligned (128))); + + typedef struct _cube_4way_context cube_4way_context; +typedef struct _cube_4way_2buf_context cube_4way_2buf_context; + int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds, - int blockbytes ); + int blockbytes ); + int cube_4way_update( cube_4way_context *sp, const void *data, size_t size ); + int cube_4way_close( cube_4way_context *sp, void *output ); + int cube_4way_update_close( cube_4way_context *sp, void *output, const void *data, size_t size ); + int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, const void *data, size_t size ); -int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen, - const void *data, size_t size ); - -#define cube512_4way_init( sp ) cube_4way_update( sp, 512 ) -#define cube512_4way_update cube_4way_update -#define cube512_4way_update_close cube_4way_update -#define cube512_4way_close cube_4way_update -#define cube512_4way_full( sp, output, data, size ) \ - cube_4way_full( sp, output, 512, data, size ) -#define cube512_4x256_full( sp, output, data, size ) \ - cube_4x256_full( sp, output, 512, data, size ) - -#define cube256_4way_init( sp ) cube_4way_update( sp, 256 ) -#define 
cube256_4way_update cube_4way_update -#define cube256_4way_update_close cube_4way_update -#define cube256_4way_close cube_4way_update -#define cube256_4way_full( sp, output, data, size ) \ - cube_4way_full( sp, output, 256, data, size ) -#define cube256_4x256_full( sp, output, data, size ) \ - cube_4x256_full( sp, output, 256, data, size ) +int cube_4way_2buf_full( cube_4way_2buf_context *sp, + void *output0, void *output1, int hashbitlen, + const void *data0, const void *data1, size_t size ); #endif -// 2x128, 2 way parallel SSE2 +// 2x128, 2 way parallel AVX2 struct _cube_2way_context { diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index c87829d..5ea1b6f 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -31,10 +31,14 @@ static void transform( cubehashParam *sp ) for ( r = 0; r < rounds; ++r ) { x1 = _mm512_add_epi32( x0, x1 ); - x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 ); - x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) ); - x0 = _mm512_xor_si512( mm512_rol_32( - mm512_swap256_128( x0 ), 11 ), x1 ); + x0 = mm512_swap_256( x0 ); + x0 = mm512_rol_32( x0, 7 ); + x0 = _mm512_xor_si512( x0, x1 ); + x1 = mm512_swap128_64( x1 ); + x1 = _mm512_add_epi32( x0, x1 ); + x0 = mm512_swap256_128( x0 ); + x0 = mm512_rol_32( x0, 11 ); + x0 = _mm512_xor_si512( x0, x1 ); x1 = mm512_swap64_32( x1 ); } diff --git a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index 595dc3d..b76d809 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -43,7 +43,8 @@ #define ROUNDS (ROUNDS1024) //#endif -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +#define ROTL64(a,n) rol64( a, n ) #if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) #define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) diff --git a/algo/groestl/aes_ni/hash-groestl256.h b/algo/groestl/aes_ni/hash-groestl256.h index 9410266..32ce1a5 100644 --- a/algo/groestl/aes_ni/hash-groestl256.h +++ b/algo/groestl/aes_ni/hash-groestl256.h @@ -63,7 +63,8 @@ typedef crypto_uint64 u64; //#define ROUNDS (ROUNDS1024) //#endif -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +#define ROTL64(a,n) rol64( a, n ) #if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) #define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) diff --git a/algo/groestl/myr-groestl.c b/algo/groestl/myr-groestl.c index 5a67303..4f17c64 100644 --- a/algo/groestl/myr-groestl.c +++ b/algo/groestl/myr-groestl.c @@ -11,7 +11,7 @@ #else #include "sph_groestl.h" #endif -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" typedef struct { #ifdef __AES__ @@ -19,7 +19,6 @@ typedef struct { #else sph_groestl512_context groestl; #endif - sph_sha256_context sha; } myrgr_ctx_holder; myrgr_ctx_holder myrgr_ctx; @@ -31,7 +30,6 @@ void init_myrgr_ctx() #else sph_groestl512_init( &myrgr_ctx.groestl ); #endif - sph_sha256_init( &myrgr_ctx.sha ); } void myriad_hash(void *output, const void *input) @@ -49,8 +47,7 @@ void myriad_hash(void *output, const void *input) sph_groestl512_close(&ctx.groestl, hash); #endif - sph_sha256( &ctx.sha, hash, 64 ); - sph_sha256_close( &ctx.sha, hash ); + sha256_full( hash, hash, 64 ); memcpy(output, hash, 32); } diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 9944ebe..26e133c 100644 --- 
a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -632,26 +632,25 @@ do { \ } while (0) -#define ROUND_BIG8(rc, alpha) \ +#define ROUND_BIG8( alpha ) \ do { \ __m512i t0, t1, t2, t3; \ - s0 = _mm512_xor_si512( s0, m512_const1_64( \ - ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ - s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ - s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ - s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ - s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ - s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ - s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ - s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ - s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ - s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ - sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ - sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ - sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ - sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ - sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ - sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ + s0 = _mm512_xor_si512( s0, alpha[ 0] ); \ + s1 = _mm512_xor_si512( s1, alpha[ 1] ); \ + s2 = _mm512_xor_si512( s2, alpha[ 2] ); \ + s3 = _mm512_xor_si512( s3, alpha[ 3] ); \ + s4 = _mm512_xor_si512( s4, alpha[ 4] ); \ + s5 = _mm512_xor_si512( s5, alpha[ 5] ); \ + s6 = _mm512_xor_si512( s6, alpha[ 6] ); \ + s7 = _mm512_xor_si512( s7, alpha[ 7] ); \ + s8 = _mm512_xor_si512( s8, alpha[ 8] ); \ + s9 = _mm512_xor_si512( s9, alpha[ 9] ); \ + sA = _mm512_xor_si512( sA, alpha[10] ); \ + sB = _mm512_xor_si512( sB, alpha[11] ); \ + sC = _mm512_xor_si512( sC, alpha[12] ); \ + sD = _mm512_xor_si512( sD, alpha[13] ); \ + sE = _mm512_xor_si512( sE, alpha[14] ); \ + sF = _mm512_xor_si512( sF, alpha[15] ); \ \ SBOX8( s0, s4, s8, sC ); \ SBOX8( s1, s5, s9, sD ); \ @@ -731,28 +730,66 @@ do { \ #define P_BIG8 \ do { \ - ROUND_BIG8(0, alpha_n); \ - ROUND_BIG8(1, alpha_n); \ - ROUND_BIG8(2, alpha_n); \ - ROUND_BIG8(3, alpha_n); \ - ROUND_BIG8(4, alpha_n); \ - ROUND_BIG8(5, alpha_n); \ + __m512i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ } while (0) #define PF_BIG8 \ do { \ - ROUND_BIG8( 0, alpha_f); \ - ROUND_BIG8( 1, alpha_f); \ - ROUND_BIG8( 2, alpha_f); \ - ROUND_BIG8( 3, alpha_f); \ - ROUND_BIG8( 4, alpha_f); \ - ROUND_BIG8( 5, alpha_f); \ - ROUND_BIG8( 6, alpha_f); \ - ROUND_BIG8( 7, alpha_f); \ - ROUND_BIG8( 8, alpha_f); \ - ROUND_BIG8( 9, alpha_f); \ - ROUND_BIG8(10, alpha_f); \ - 
ROUND_BIG8(11, alpha_f); \ + __m512i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ } while (0) #define T_BIG8 \ @@ -965,26 +1002,25 @@ do { \ #define sF m7 */ -#define ROUND_BIG(rc, alpha) \ +#define ROUND_BIG( alpha ) \ do { \ __m256i t0, t1, t2, t3; \ - s0 = _mm256_xor_si256( s0, m256_const1_64( \ - ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ - s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ - s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ - s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ - s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ - s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ - s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ - s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ - s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ - s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ - sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ - sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ - sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ - sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ - sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ - sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ + s0 = _mm256_xor_si256( s0, alpha[ 0] ); \ + s1 = _mm256_xor_si256( s1, alpha[ 1] ); \ + s2 = _mm256_xor_si256( s2, alpha[ 2] ); \ + s3 = _mm256_xor_si256( s3, alpha[ 3] ); \ + s4 = _mm256_xor_si256( s4, alpha[ 4] ); \ + s5 = _mm256_xor_si256( s5, alpha[ 5] ); \ + s6 = _mm256_xor_si256( s6, alpha[ 6] ); \ + s7 = _mm256_xor_si256( s7, alpha[ 7] ); \ + s8 = _mm256_xor_si256( s8, alpha[ 8] ); \ + s9 = _mm256_xor_si256( s9, alpha[ 9] ); \ + sA = _mm256_xor_si256( sA, alpha[10] ); \ + sB = _mm256_xor_si256( sB, alpha[11] ); \ + sC = _mm256_xor_si256( sC, alpha[12] ); \ + sD = _mm256_xor_si256( sD, alpha[13] ); \ + sE = _mm256_xor_si256( sE, alpha[14] ); \ + sF = 
_mm256_xor_si256( sF, alpha[15] ); \ \ SBOX( s0, s4, s8, sC ); \ SBOX( s1, s5, s9, sD ); \ @@ -1064,28 +1100,66 @@ do { \ #define P_BIG \ do { \ - ROUND_BIG(0, alpha_n); \ - ROUND_BIG(1, alpha_n); \ - ROUND_BIG(2, alpha_n); \ - ROUND_BIG(3, alpha_n); \ - ROUND_BIG(4, alpha_n); \ - ROUND_BIG(5, alpha_n); \ + __m256i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ } while (0) #define PF_BIG \ do { \ - ROUND_BIG( 0, alpha_f); \ - ROUND_BIG( 1, alpha_f); \ - ROUND_BIG( 2, alpha_f); \ - ROUND_BIG( 3, alpha_f); \ - ROUND_BIG( 4, alpha_f); \ - ROUND_BIG( 5, alpha_f); \ - ROUND_BIG( 6, alpha_f); \ - ROUND_BIG( 7, alpha_f); \ - ROUND_BIG( 8, alpha_f); \ - ROUND_BIG( 9, alpha_f); \ - ROUND_BIG(10, alpha_f); \ - ROUND_BIG(11, alpha_f); \ + __m256i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ } while (0) #define T_BIG \ diff --git a/algo/hodl/hodl-wolf.c b/algo/hodl/hodl-wolf.c index 6ff6175..7ce79da 100644 --- a/algo/hodl/hodl-wolf.c +++ b/algo/hodl/hodl-wolf.c @@ -7,6 +7,7 @@ #include "hodl-gate.h" #include "hodl-wolf.h" #include "miner.h" +#include "algo/sha/sha256d.h" #if defined(__AES__) diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index 282ae91..c710836 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -1,5 +1,6 @@ #include "keccak-gate.h" #include "sph_keccak.h" +#include "algo/sha/sha256d.h" int hard_coded_eb = 1; diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index e2545b4..af37d6f 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -70,13 +70,13 @@ static const 
uint64_t RC[] = { // Targetted macros, keccak-macros.h is included for each target. -#define DECL64(x) __m512i x -#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) -#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) -#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) -#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) -#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) -#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) +#define DECL64(x) __m512i x +#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) +#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) +#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) +#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) +#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) +#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) #define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c)) diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index 833b87e..f15648a 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -16,7 +16,7 @@ typedef struct { blake256_16way_context blake; keccak256_8way_context keccak; - cube_4way_context cube; + cube_4way_2buf_context cube; skein256_8way_context skein; #if defined(__VAES__) groestl256_4way_context groestl; @@ -30,13 +30,7 @@ static __thread allium_16way_ctx_holder allium_16way_ctx; bool init_allium_16way_ctx() { keccak256_8way_init( &allium_16way_ctx.keccak ); - cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 ); skein256_8way_init( &allium_16way_ctx.skein ); -#if defined(__VAES__) - groestl256_4way_init( &allium_16way_ctx.groestl, 32 ); -#else - init_groestl256( &allium_16way_ctx.groestl, 32 ); -#endif return true; } @@ -111,12 +105,11 @@ void allium_16way_hash( void *state, const void *input ) intrlv_2x256( vhash, hash14, hash15, 256 ); LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash14, hash15, vhash, 256 ); - + intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 ); - cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); - cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 ); dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 ); @@ -124,8 +117,7 @@ void allium_16way_hash( void *state, const void *input ) intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 ); intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 ); - cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); - cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 ); dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 ); dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 ); @@ -255,7 +247,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, typedef struct { blake256_8way_context blake; keccak256_4way_context keccak; - cubehashParam cube; + cube_2way_context cube; skein256_4way_context skein; #if defined(__VAES__) groestl256_2way_context groestl; @@ -269,13 +261,7 @@ static __thread allium_8way_ctx_holder allium_8way_ctx; bool init_allium_8way_ctx() { keccak256_4way_init( &allium_8way_ctx.keccak ); - cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 ); skein256_4way_init( &allium_8way_ctx.skein ); -#if defined(__VAES__) - groestl256_2way_init( &allium_8way_ctx.groestl, 32 ); -#else - init_groestl256( &allium_8way_ctx.groestl, 32 ); -#endif return true; } @@ -320,21 +306,20 @@ void allium_8way_hash( void *hash, 
const void *input ) LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 ); LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 ); + + intrlv_2x128( vhashA, hash0, hash1, 256 ); + intrlv_2x128( vhashB, hash2, hash3, 256 ); + cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); + cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + dintrlv_2x128( hash0, hash1, vhashA, 256 ); + dintrlv_2x128( hash2, hash3, vhashB, 256 ); + + intrlv_2x128( vhashA, hash4, hash5, 256 ); + intrlv_2x128( vhashB, hash6, hash7, 256 ); + cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); + cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + dintrlv_2x128( hash4, hash5, vhashA, 256 ); + dintrlv_2x128( hash6, hash7, vhashB, 256 ); LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index b24b173..1c90444 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -66,13 +66,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ G2W_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_ror256_64( s1); \ + s3 = mm512_shufll256_64( s3 ); \ + s1 = mm512_shuflr256_64( s1); \ s2 = mm512_swap256_128( s2 ); \ - s3 = mm512_rol256_64( s3 ); \ G2W_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_rol256_64( s1 ); \ - s2 = mm512_swap256_128( s2 ); \ - s3 = mm512_ror256_64( s3 ); + s3 = mm512_shuflr256_64( s3 ); \ + s1 = mm512_shufll256_64( s1 ); \ + s2 = mm512_swap256_128( s2 ); #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \ LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ @@ -107,13 +107,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ G_4X64( s0, s1, s2, s3 ); \ - s1 = mm256_ror_1x64( s1); \ + s3 = mm256_shufll_64( s3 ); \ + s1 = mm256_shuflr_64( s1); \ s2 = mm256_swap_128( s2 ); \ - s3 = mm256_rol_1x64( s3 ); \ G_4X64( s0, s1, s2, s3 ); \ - s1 = mm256_rol_1x64( s1 ); \ - s2 = mm256_swap_128( s2 ); \ - s3 = mm256_ror_1x64( s3 ); + s3 = mm256_shuflr_64( s3 ); \ + s1 = mm256_shufll_64( s1 ); \ + s2 = mm256_swap_128( s2 ); #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ @@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_ror256_64( s2, s3 ); \ + mm128_vrol256_64( s6, s7 ); \ + mm128_vror256_64( s2, s3 ); \ mm128_swap256_128( s4, s5 ); \ - mm128_rol256_64( s6, s7 ); \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - 
mm128_rol256_64( s2, s3 ); \ - mm128_swap256_128( s4, s5 ); \ - mm128_ror256_64( s6, s7 ); + mm128_vror256_64( s6, s7 ); \ + mm128_vrol256_64( s2, s3 ); \ + mm128_swap256_128( s4, s5 ); #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ diff --git a/algo/m7m/m7m.c b/algo/m7m/m7m.c index ab13a7e..2bf4a11 100644 --- a/algo/m7m/m7m.c +++ b/algo/m7m/m7m.c @@ -13,6 +13,7 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/ripemd/sph_ripemd.h" #include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #define EPSa DBL_EPSILON #define EPS1 DBL_EPSILON @@ -104,8 +105,8 @@ uint32_t sw2_( int nnounce ) } typedef struct { - sph_sha256_context sha256; - sph_sha512_context sha512; + sha256_context sha256; + sph_sha512_context sha512; sph_keccak512_context keccak; sph_whirlpool_context whirlpool; sph_haval256_5_context haval; @@ -117,7 +118,7 @@ m7m_ctx_holder m7m_ctx; void init_m7m_ctx() { - sph_sha256_init( &m7m_ctx ); + sha256_ctx_init( &m7m_ctx.sha256 ); sph_sha512_init( &m7m_ctx.sha512 ); sph_keccak512_init( &m7m_ctx.keccak ); sph_whirlpool_init( &m7m_ctx.whirlpool ); @@ -153,11 +154,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64))); memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) ); - sph_sha256_context ctxf_sha256; memcpy(data, pdata, 80); - sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN ); + sha256_update( &ctx1.sha256, data, M7_MIDSTATE_LEN ); sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN ); sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN ); sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN ); @@ -189,8 +189,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) ); - sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); - sph_sha256_close( &ctx2.sha256, bhash[0] ); + sha256_update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); + sha256_final( &ctx2.sha256, bhash[0] ); sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN ); sph_sha512_close( &ctx2.sha512, bhash[1] ); @@ -225,9 +225,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, bytes = mpz_sizeinbase(product, 256); mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product); - sph_sha256_init( &ctxf_sha256 ); - sph_sha256( &ctxf_sha256, bdata, bytes ); - sph_sha256_close( &ctxf_sha256, hash ); + sha256_full( hash, bdata, bytes ); digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75); mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16); @@ -260,10 +258,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, mpzscale=bytes; mpz_export(bdata, NULL, -1, 1, 0, 0, product); - sph_sha256_init( &ctxf_sha256 ); - sph_sha256( &ctxf_sha256, bdata, bytes ); - sph_sha256_close( &ctxf_sha256, hash ); - } + sha256_full( hash, bdata, bytes ); + } if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget ) && !opt_benchmark ) ) diff --git a/algo/ripemd/lbry.c b/algo/ripemd/lbry.c index 94f3417..e91b287 100644 --- a/algo/ripemd/lbry.c +++ b/algo/ripemd/lbry.c @@ -7,24 +7,19 @@ #include #include #include "sph_ripemd.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" void lbry_hash(void* output, const void* input) { - sph_sha256_context ctx_sha256 __attribute__ ((aligned (64))); + sha256_context ctx_sha256 __attribute__ ((aligned (64))); sph_sha512_context ctx_sha512 __attribute__ ((aligned (64))); sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64))); uint32_t _ALIGN(64) hashA[16]; uint32_t _ALIGN(64) 
hashB[16]; uint32_t _ALIGN(64) hashC[16]; - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, input, 112 ); - sph_sha256_close( &ctx_sha256, hashA ); - - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hashA, 32 ); - sph_sha256_close( &ctx_sha256, hashA ); + sha256_full( hashA, input, 112 ); + sha256_full( hashA, hashA, 32 ); sph_sha512_init( &ctx_sha512 ); sph_sha512( &ctx_sha512, hashA, 32 ); @@ -38,15 +33,13 @@ void lbry_hash(void* output, const void* input) sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 ); sph_ripemd160_close( &ctx_ripemd, hashC ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hashB, 20 ); - sph_sha256( &ctx_sha256, hashC, 20 ); - sph_sha256_close( &ctx_sha256, hashA ); - - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hashA, 32 ); - sph_sha256_close( &ctx_sha256, hashA ); + sha256_ctx_init( &ctx_sha256 ); + sha256_update( &ctx_sha256, hashB, 20 ); + sha256_update( &ctx_sha256, hashC, 20 ); + sha256_final( &ctx_sha256, hashA ); + sha256_full( hashA, hashA, 32 ); + memcpy( output, hashA, 32 ); } diff --git a/algo/scrypt/neoscrypt.c b/algo/scrypt/neoscrypt.c index 7cb4c82..709b268 100644 --- a/algo/scrypt/neoscrypt.c +++ b/algo/scrypt/neoscrypt.c @@ -69,8 +69,12 @@ typedef unsigned int uint; #define SCRYPT_HASH_BLOCK_SIZE 64U #define SCRYPT_HASH_DIGEST_SIZE 32U -#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) -#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) +//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) +//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + +#define ROTL32(a,b) rol32(a,b) +#define ROTR32(a,b) ror32(a,b) + #define U8TO32_BE(p) \ (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c new file mode 100644 index 0000000..19ff9cd --- /dev/null +++ b/algo/scrypt/scrypt-core-4way.c @@ -0,0 +1,3981 @@ +#include "scrypt-core-4way.h" + +////////////////////////////////////////////////////////////////////////// +// +// Optimized Salsa implementation inspired by Pooler. +// Any similarities are not a coincidence. +// +// Implementations include reference X64, SSE2, AVX2 & AVX512 +// using both serial and parallel vectoring using SIMD instruction. +// +// Generic macros are providedi and invoked with different targets depending +// on level of parallelism and data organization. Targets for any macros +// needed must be defined by the calling function. XOR, ROL32 and ADD32 +// are needed in all cases. Additionally ROL_1X32, SWAP_64 and ROR_1X32 +// shuffles are needed for serial SIMD. +// +// SALSA_8ROUNDS_SIMD uses vectors on serial data rather than traditional +// n-way parallel hashing. +// The SIMD version has different implied arguments {X0:X3}, representing +// an array of 4 vectors of 4 32 bit words, while the version used for +// regular parallel hashing has {x0:xf} representing array of 16 by 32 bit +// words. +// These arguments must be defined by the calling function. +// The calling function must also define targets for all macros used for +// arithmetic, logic and shuffling: XOR, ROL32, ADD32 for all targets and +// ROL_1X32, SWAP_64, ROR_1X32 for serial SIMD targets. +// +// Serial and parallel SIMD will be combined with AVX2 doing 2 way +// parallel over 4 way linear for 8 way throughput, and AVX512 doing +// 4 way parallel over 4 way linear for 16 way thoughput. 
+//
+// The term SIMD128 here refers to vectors that contain multiple contiguous
+// data from a single stream (lane) as opposed to parallel vectors that
+// contain interleaved words of data from multiple streams.
+//
+// The sequencing of techniques in the naming convention is a little
+// mixed up. The logical hierarchy top down is to put Nbuffs at the top
+// where each buffer then performs another technique.
+//
+// Although Nway and SIMD128 are listed in top down order, Nbuffs is
+// always listed last:
+//
+// scrypt_core_simd128_2way means a linear SIMD operation on 2 parallel
+// streams of data, while
+// scrypt_core_2way_simd128 is 2 parallel streams of linear SIMD vectors.
+//
+///////////////////////////////////////////////////////////////////////////
+
+
+// Used by all targets, needs XOR, ROL32 & ADD32 macros defined.
+// Function-like, the result typically overwrites in1.
+//
+#define ARX( in1, in2, in3, n ) \
+   XOR( in1, ROL32( ADD32( in2, in3 ), n ) )
+
+// Multi buffering has 2 main benefits and one drawback.
+// Traditionally double buffering has been used to empty one bucket
+// while another is filling. This requires a second (or 3rd, etc)
+// bucket. The computing analogy is to use 2 registers, 1 to read
+// and 1 to write, and switch back and forth.
+//
+// The second benefit in computing is using multiple registers to
+// provide data independence that improves multiple instruction issue and
+// pipelining in the CPU. The number of buffers is limited by the number
+// of registers available. Three seems to be a sweet spot as a 4 variable
+// data set uses 12 registers triple buffered, leaving 4 of 16 as temps.
+// Many pipelined instructions require 3 clocks to complete and triple
+// buffering keeps the pipeline full. Many execution units are also 3 wide
+// allowing up to 3 similar instructions to be issued per clock.
+// However, execution units are shared by hyperthreading which reduces
+// the effect on a single thread.
+//
+// The drawback is the increased size of the data. Although multi buffering
+// also improves memory throughput this is offset by the amount of
+// memory required and its effect on cache performance and will eventually
+// hit memory bus saturation.
+//
+// For example scryptn2 struggles with more than 4 buffers, multi
+// buffered and parallel SIMD combined, and performance drops. This can
+// be mitigated somewhat by reducing the number of CPU threads but
+// ultimately excessive multi buffering has a negative impact.
+//
+// Unlike parallel SIMD, increasing multi buffering does not require a
+// CPU technology increase, ie SSE2 to AVX2 or AVX2 to AVX512.
+// SSE2 is limited to 4 way SIMD but there is no theoretical limit to multi buffering.
+// Multi buffering also does not suffer the clock penalty of increasing
+// parallelism.
+//
+// Multi buffering implementations here focus on powers of 2
+// to match sha256 without re-interleaving the data.
+//
+// A decision will have to be made at run time, based on the N factor,
+// whether to use multi buffering or serial execution.
+
+// Need TYPE macro defined.
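As a reading aid, a minimal scalar sketch of what ARX and its multi-buffered forms compute, written in plain C. It is not part of the patch and the helper names are illustrative: ARX is one xor/rotate/add step on 32-bit words, and the 2-buffer form runs the same step on two independent streams so the CPU can overlap their dependency chains.

#include <stdint.h>

// Rotate left on 32-bit words; n is expected to be 1..31 here.
static inline uint32_t rol32_ref( uint32_t x, unsigned n )
{
    return ( x << n ) | ( x >> ( 32 - n ) );
}

// ARX( in1, in2, in3, n ) computes in1 ^ ROL32( in2 + in3, n ).
static inline uint32_t arx_ref( uint32_t in1, uint32_t in2, uint32_t in3,
                                unsigned n )
{
    return in1 ^ rol32_ref( in2 + in3, n );
}

// Two-buffer form: streams A and B are independent, so the add/rotate/xor
// chains of both can be issued and pipelined in parallel by the CPU.
static inline void arx_2buf_ref( uint32_t *a1, uint32_t a2, uint32_t a3,
                                 uint32_t *b1, uint32_t b2, uint32_t b3,
                                 unsigned n )
{
    uint32_t ta = rol32_ref( a2 + a3, n );
    uint32_t tb = rol32_ref( b2 + b3, n );
    *a1 ^= ta;
    *b1 ^= tb;
}

The ARX_2BUF/ARX_3BUF/ARX_4BUF macros below are the vector analogues of this pattern, with TYPE standing in for the SIMD register type.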
+#define ARX_2BUF( a1, a2, a3, b1, b2, b3, n ) \ +do{ \ + TYPE ta = ADD32( a2, a3 ); \ + TYPE tb = ADD32( b2, b3 ); \ + ta = ROL32( ta, n ); \ + tb = ROL32( tb, n ); \ + a1 = XOR( a1, ta ); \ + b1 = XOR( b1, tb ); \ +} while (0); + +#define ARX_3BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, n ) \ +do{ \ + TYPE ta = ADD32( a2, a3 ); \ + TYPE tb = ADD32( b2, b3 ); \ + TYPE tc = ADD32( c2, c3 ); \ + ta = ROL32( ta, n ); \ + tb = ROL32( tb, n ); \ + tc = ROL32( tc, n ); \ + a1 = XOR( a1, ta ); \ + b1 = XOR( b1, tb ); \ + c1 = XOR( c1, tc ); \ +} while (0); + +// use 16 regs AVX, AVX2, 8 buf for AVX512? +#define ARX_4BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, d1, d2, d3, n ) \ +do{ \ + TYPE ta = ADD32( a2, a3 ); \ + TYPE tb = ADD32( b2, b3 ); \ + TYPE tc = ADD32( c2, c3 ); \ + TYPE td = ADD32( d2, d3 ); \ + ta = ROL32( ta, n ); \ + tb = ROL32( tb, n ); \ + tc = ROL32( tc, n ); \ + td = ROL32( td, n ); \ + a1 = XOR( a1, ta ); \ + b1 = XOR( b1, tb ); \ + c1 = XOR( c1, tc ); \ + d1 = XOR( d1, td ); \ +} while (0); + + +// Used by SIMD128 and hybrid targets, needs also ROL_1X32, SWAP_64 & +// ROR_1X32 defined. +// +// Implied arguments ( X0 = { x3, x2, x1, x0 }, +// X1 = { x7, x6, x5, x4 }, +// X3 = { xb, xa, x9, x8 }, +// X3 = { xf, xe, xd, xc } ) +// +#define SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ) \ + /* Operate on columns */ \ + X1 = ARX( X1, X0, X3, 7 ); /* ( x4, x0, xc, 7 ) */ \ + X2 = ARX( X2, X1, X0, 9 ); /* ( x8, x4, x0, 9 ) */ \ + X3 = ARX( X3, X2, X1, 13 ); /* ( xc, x8, x4, 13 ) */ \ + X0 = ARX( X0, X3, X2, 18 ); /* ( x0, xc, x8, 18 ) */ \ + /* Rearrange data */ \ + X1 = ROL_1X32( X1 ); \ + X3 = ROR_1X32( X3 ); \ + X2 = SWAP_64( X2 ); \ + /* Operate on rows */ \ + X3 = ARX( X3, X0, X1, 7 ); \ + X2 = ARX( X2, X3, X0, 9 ); \ + X1 = ARX( X1, X2, X3, 13 ); \ + X0 = ARX( X0, X1, X2, 18 ); \ + /* Rearrange data */ \ + X3 = ROL_1X32( X3 ); \ + X1 = ROR_1X32( X1 ); \ + X2 = SWAP_64( X2 ); \ + +// Final round optimization, don't rearange data back to original order on exit +// Used only on pre-AVX2 CPUs where blend instruction is not avaiable. +// It saves a few redundant shuffles. 
+#define SALSA_2ROUNDS_FINAL_SIMD128( X0, X1, X2, X3 ) \ + /* Operate on columns */ \ + X1 = ARX( X1, X0, X3, 7 ); /* ( x4, x0, xc, 7 ) */ \ + X2 = ARX( X2, X1, X0, 9 ); /* ( x8, x4, x0, 9 ) */ \ + X3 = ARX( X3, X2, X1, 13 ); /* ( xc, x8, x4, 13 ) */ \ + X0 = ARX( X0, X3, X2, 18 ); /* ( x0, xc, x8, 18 ) */ \ + /* Rearrange data */ \ + X1 = ROL_1X32( X1 ); \ + X3 = ROR_1X32( X3 ); \ + X2 = SWAP_64( X2 ); \ + /* Operate on rows */ \ + X3 = ARX( X3, X0, X1, 7 ); \ + X2 = ARX( X2, X3, X0, 9 ); \ + X1 = ARX( X1, X2, X3, 13 ); \ + X0 = ARX( X0, X1, X2, 18 ); \ + /* Final round, don't rearrange data + X1 = ROR_1X32( X1 ); \ + X2 = SWAP_64( X2 ); \ + X3 = ROL_1X32( X3 ); */ + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3 ) +#define SALSA_2ROUNDS_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); + +#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, +// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3 ) +#define SALSA_2ROUNDS_SIMD128_4BUF \ + ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ + ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ + ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ + ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XD1 = ROL_1X32( XD1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XD3 = ROR_1X32( XD3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XD2 = SWAP_64( XD2 ); \ + ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ + ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ + ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ + ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, XD0, XD1, XD2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XC3 = ROL_1X32( XC3 ); \ + XD3 = ROL_1X32( XD3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC1 = ROR_1X32( XC1 ); \ + XD1 = ROR_1X32( XD1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XD2 = SWAP_64( XD2 ); + +#define 
SALSA_2ROUNDS_FINAL_SIMD128_4BUF \ + ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ + ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ + ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ + ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XD1 = ROL_1X32( XD1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XD3 = ROR_1X32( XD3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XD2 = SWAP_64( XD2 ); \ + ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ + ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ + ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ + ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, XD0, XD1, XD2, 18 ); + +// Inlined ARX +#define SALSA_2ROUNDS_SIMD128_3BUF \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE TC = ADD32( XC0, XC3 ); \ + TA = ROL32( TA, 7 ); \ + TB = ROL32( TB, 7 ); \ + TC = ROL32( TC, 7 ); \ + XA1 = XOR( XA1, TA ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + TC = ADD32( XC1, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + TC = ADD32( XC2, XC1 ); \ + TA = ROL32( TA, 13 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = XOR( XA3, TA ); \ + TB = ROL32( TB, 13 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 13 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + TC = ADD32( XC3, XC2 ); \ + TA = ROL32( TA, 18 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XA0 = XOR( XA0, TA ); \ + TB = ROL32( TB, 18 ); \ + XB0 = XOR( XB0, TB ); \ + TC = ROL32( TC, 18 ); \ + XC0 = XOR( XC0, TC ); \ +\ + TA = ADD32( XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + TC = ADD32( XC0, XC1 ); \ + TA = ROL32( TA, 7 ); \ + XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, TA ); \ + TB = ROL32( TB, 7 ); \ + XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 7 ); \ + XC3 = ROR_1X32( XC3 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + TC = ADD32( XC3, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + TA = ROL32( TA, 13 ); \ + TC = ADD32( XC2, XC3 ); \ + XA3 = ROL_1X32( XA3 ); \ + TB = ROL32( TB, 13 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = XOR( XA1, TA ); \ + TC = ROL32( TC, 13 ); \ + XC3 = ROL_1X32( XC3 ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + TA = ROL32( TA, 18); \ + TC = ADD32( XC1, XC2 ); \ + XA2 = SWAP_64( XA2 ); \ + TB = ROL32( TB, 18); \ + XA0 = XOR( XA0, TA ); \ + XB2 = SWAP_64( XB2 ); \ + TC = ROL32( TC, 18); \ + XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC0 = XOR( XC0, TC ); \ + XC1 = ROR_1X32( XC1 ); \ +} while (0); + + +// slow rol, an attempt to 
optimze non-avx512 bit rotations +#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROL \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE TC = ADD32( XC0, XC3 ); \ + TYPE T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ + XA1 = XOR( XA1, T ); \ + T = _mm_slli_epi32( TB, 7 );\ + XA1 = XOR( XA1, TA ); \ + TB = _mm_srli_epi32( TB, 25 ); \ + XB1 = XOR( XB1, T ); \ + T = _mm_slli_epi32( TC, 7 );\ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, T ); \ + TC = _mm_srli_epi32( TC, 25 );\ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + TC = ADD32( XC1, XC0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + T = _mm_slli_epi32( TB, 9 );\ + TB = _mm_srli_epi32( TB, 23 );\ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, T ); \ + T = _mm_slli_epi32( TC, 9 );\ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, T ); \ + TC = _mm_srli_epi32( TC, 23 );\ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + TC = ADD32( XC2, XC1 ); \ + T = _mm_slli_epi32( TA, 13); \ + TA = _mm_srli_epi32( TA, 19 ); \ + XA1 = ROL_1X32( XA1 ); \ + XA3 = XOR( XA3, T ); \ + XB1 = ROL_1X32( XB1 ); \ + T = _mm_slli_epi32( TB, 13); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XA3 = XOR( XA3, TA ); \ + XB3 = XOR( XB3, T ); \ + T = _mm_slli_epi32( TC, 13); \ + TC = _mm_srli_epi32( TC, 19 ); \ + XB3 = XOR( XB3, TB ); \ + XC3 = XOR( XC3, T ); \ + XC1 = ROL_1X32( XC1 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + TC = ADD32( XC3, XC2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ + XA2 = SWAP_64( XA2 ); \ + XA0 = XOR( XA0, T ); \ + T = _mm_slli_epi32( TB, 18 ); \ + XB2 = SWAP_64( XB2 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XB0 = XOR( XB0, T ); \ + T = _mm_slli_epi32( TC, 18 ); \ + XA0 = XOR( XA0, TA ); \ + TC = _mm_srli_epi32( TC, 14 ); \ + XC0 = XOR( XC0, T ); \ + XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ + XC0 = XOR( XC0, TC ); \ +\ + TA = ADD32( XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + TC = ADD32( XC0, XC1 ); \ + TA = ROL32( TA, 7 ); \ + XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, TA ); \ + TB = ROL32( TB, 7 ); \ + XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 7 ); \ + XC3 = ROR_1X32( XC3 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + TC = ADD32( XC3, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + TA = ROL32( TA, 13 ); \ + TC = ADD32( XC2, XC3 ); \ + XA3 = ROL_1X32( XA3 ); \ + TB = ROL32( TB, 13 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = XOR( XA1, TA ); \ + TC = ROL32( TC, 13 ); \ + XC3 = ROL_1X32( XC3 ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + TA = ROL32( TA, 18); \ + TC = ADD32( XC1, XC2 ); \ + XA2 = SWAP_64( XA2 ); \ + TB = ROL32( TB, 18); \ + XA0 = XOR( XA0, TA ); \ + XB2 = SWAP_64( XB2 ); \ + TC = ROL32( TC, 18); \ + XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC0 = XOR( XC0, TC ); \ + XC1 = ROR_1X32( XC1 ); \ +} while (0); + + +/* +// Standard version using ARX +#define SALSA_2ROUNDS_SIMD128_3BUF \ + ARX_3BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, 7 ); \ + ARX_3BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, 9 ); \ + 
ARX_3BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, 13 ); \ + ARX_3BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + ARX_3BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, 7 ); \ + ARX_3BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, 9 ); \ + ARX_3BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, 13 ); \ + ARX_3BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XC3 = ROL_1X32( XC3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC1 = ROR_1X32( XC1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); +*/ + +#define SALSA_2ROUNDS_FINAL_SIMD128_3BUF \ + ARX_3BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, 7 ); \ + ARX_3BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, 9 ); \ + ARX_3BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, 13 ); \ + ARX_3BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + ARX_3BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, 7 ); \ + ARX_3BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, 9 ); \ + ARX_3BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, 13 ); \ + ARX_3BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, 18 ); + + +#define SALSA_8ROUNDS_SIMD128 \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); + +#define SALSA_8ROUNDS_FINAL_SIMD128 \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_FINAL_SIMD128( X0, X1, X2, X3 ); + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3 ) +#define SALSA_8ROUNDS_SIMD128_2BUF \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; + +#define SALSA_8ROUNDS_FINAL_SIMD128_2BUF \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_2BUF; + +#define SALSA_8ROUNDS_SIMD128_3BUF \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; + +#define SALSA_8ROUNDS_FINAL_SIMD128_3BUF \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_3BUF; + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, +// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3, ) +#define SALSA_8ROUNDS_SIMD128_4BUF \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; + +#define SALSA_8ROUNDS_FINAL_SIMD128_4BUF \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_4BUF; + +// Used by reference code and pure parallel implementations +// +// Implied arguments ( x0, x1, x2, x3, x4, x5, x6, x7, +// x8, x9, xa, xb, xc, xd, xe, xf ) +// +#define SALSA_COLUMN \ + x4 
= ARX( x4, x0, xc, 7 ); \ + x9 = ARX( x9, x5, x1, 7 ); \ + xe = ARX( xe, xa, x6, 7 ); \ + x3 = ARX( x3, xf, xb, 7 ); \ + x8 = ARX( x8, x4, x0, 9 ); \ + xd = ARX( xd, x9, x5, 9 ); \ + x2 = ARX( x2, xe, xa, 9 ); \ + x7 = ARX( x7, x3, xf, 9 ); \ + xc = ARX( xc, x8, x4, 13 ); \ + x1 = ARX( x1, xd, x9, 13 ); \ + x6 = ARX( x6, x2, xe, 13 ); \ + xb = ARX( xb, x7, x3, 13 ); \ + x0 = ARX( x0, xc, x8, 18 ); \ + x5 = ARX( x5, x1, xd, 18 ); \ + xa = ARX( xa, x6, x2, 18 ); \ + xf = ARX( xf, xb, x7, 18 ) + +#define SALSA_ROW \ + x1 = ARX( x1, x0, x3, 7 ); \ + x6 = ARX( x6, x5, x4, 7 ); \ + xb = ARX( xb, xa, x9, 7 ); \ + xc = ARX( xc, xf, xe, 7 ); \ + x2 = ARX( x2, x1, x0, 9 ); \ + x7 = ARX( x7, x6, x5, 9 ); \ + x8 = ARX( x8, xb, xa, 9 ); \ + xd = ARX( xd, xc, xf, 9 ); \ + x3 = ARX( x3, x2, x1, 13 ); \ + x4 = ARX( x4, x7, x6, 13 ); \ + x9 = ARX( x9, x8, xb, 13 ); \ + xe = ARX( xe, xd, xc, 13 ); \ + x0 = ARX( x0, x3, x2, 18 ); \ + x5 = ARX( x5, x4, x7, 18 ); \ + xa = ARX( xa, x9, x8, 18 ); \ + xf = ARX( xf, xe, xd, 18 ); + +#define SALSA_2ROUNDS SALSA_COLUMN; SALSA_ROW; + +#define SALSA_8ROUNDS \ + SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// Tested OK but very slow +// 16 way parallel, requires 16x32 interleaving +static void xor_salsa8_16way( __m512i * const B, const __m512i * const C) +{ + __m512i x0 = B[ 0] = _mm512_xor_si512( B[ 0], C[ 0] ); + __m512i x1 = B[ 1] = _mm512_xor_si512( B[ 1], C[ 1] ); + __m512i x2 = B[ 2] = _mm512_xor_si512( B[ 2], C[ 2] ); + __m512i x3 = B[ 3] = _mm512_xor_si512( B[ 3], C[ 3] ); + __m512i x4 = B[ 4] = _mm512_xor_si512( B[ 4], C[ 4] ); + __m512i x5 = B[ 5] = _mm512_xor_si512( B[ 5], C[ 5] ); + __m512i x6 = B[ 6] = _mm512_xor_si512( B[ 6], C[ 6] ); + __m512i x7 = B[ 7] = _mm512_xor_si512( B[ 7], C[ 7] ); + __m512i x8 = B[ 8] = _mm512_xor_si512( B[ 8], C[ 8] ); + __m512i x9 = B[ 9] = _mm512_xor_si512( B[ 9], C[ 9] ); + __m512i xa = B[10] = _mm512_xor_si512( B[10], C[10] ); + __m512i xb = B[11] = _mm512_xor_si512( B[11], C[11] ); + __m512i xc = B[12] = _mm512_xor_si512( B[12], C[12] ); + __m512i xd = B[13] = _mm512_xor_si512( B[13], C[13] ); + __m512i xe = B[14] = _mm512_xor_si512( B[14], C[14] ); + __m512i xf = B[15] = _mm512_xor_si512( B[15], C[15] ); + + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] = _mm512_add_epi32( B[ 0], x0 ); + B[ 1] = _mm512_add_epi32( B[ 1], x1 ); + B[ 2] = _mm512_add_epi32( B[ 2], x2 ); + B[ 3] = _mm512_add_epi32( B[ 3], x3 ); + B[ 4] = _mm512_add_epi32( B[ 4], x4 ); + B[ 5] = _mm512_add_epi32( B[ 5], x5 ); + B[ 6] = _mm512_add_epi32( B[ 6], x6 ); + B[ 7] = _mm512_add_epi32( B[ 7], x7 ); + B[ 8] = _mm512_add_epi32( B[ 8], x8 ); + B[ 9] = _mm512_add_epi32( B[ 9], x9 ); + B[10] = _mm512_add_epi32( B[10], xa ); + B[11] = _mm512_add_epi32( B[11], xb ); + B[12] = _mm512_add_epi32( B[12], xc ); + B[13] = _mm512_add_epi32( B[13], xd ); + B[14] = _mm512_add_epi32( B[14], xe ); + B[15] = _mm512_add_epi32( B[15], xf ); +} + +void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 128*16 ); + xor_salsa8_16way( &X[ 0], &X[16] ); + xor_salsa8_16way( &X[16], &X[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + m512_ovly *vptr[16]; // pointer to V offset for each lane + m512_ovly *x16 = (m512_ovly*)(&X[16]); + + // create pointers to V for 
each lane using data from each lane of X[16] + // as index. + for ( int l = 0; l < 16; l++ ) + { + uint32_t xl = (*x16).u32[l]; + vptr[l] = (m512_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); + } + + for ( int k = 0; k < 32; k++ ) + { + m512_ovly v; // V value assembled from different indexes + for ( int l = 0; l < 8; l++ ) + v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; + X[ k ] = _mm512_xor_si512( X[ k ], v.m512 ); + } + + xor_salsa8_16way( &X[ 0], &X[16] ); + xor_salsa8_16way( &X[16], &X[ 0] ); + } +} + +// Working, not up to date, needs stream optimization. +// 4x32 interleaving +static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) +{ + __m512i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m512i *B = (__m512i*)b; + const __m512i *C = (const __m512i*)c; + + // mix C into B then shuffle B into X + B[0] = _mm512_xor_si512( B[0], C[0] ); + B[1] = _mm512_xor_si512( B[1], C[1] ); + B[2] = _mm512_xor_si512( B[2], C[2] ); + B[3] = _mm512_xor_si512( B[3], C[3] ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[1], B[0] ); + X0 = _mm512_mask_blend_epi64( 0x30, B[3], B[2] ); + X0 = _mm512_mask_blend_epi64( 0x0f, X0, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[2], B[1] ); + X1 = _mm512_mask_blend_epi64( 0x30, B[0], B[3] ); + X1 = _mm512_mask_blend_epi64( 0x0f, X1, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[3], B[2] ); + X2 = _mm512_mask_blend_epi64( 0x30, B[1], B[0] ); + X2 = _mm512_mask_blend_epi64( 0x0f, X2, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[0], B[3] ); + X3 = _mm512_mask_blend_epi64( 0x30, B[2], B[1] ); + X3 = _mm512_mask_blend_epi64( 0x0f, X3, Y0 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm512_shufll_128 + #define ROR_1X32 mm512_shuflr_128 + #define SWAP_64 mm512_swap_256 + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + Y0 = _mm512_mask_blend_epi64( 0xc0, X0, X1 ); + Y1 = _mm512_mask_blend_epi64( 0x03, X0, X1 ); + Y2 = _mm512_mask_blend_epi64( 0x0c, X0, X1 ); + Y3 = _mm512_mask_blend_epi64( 0x30, X0, X1 ); + + Y0 = _mm512_mask_blend_epi64( 0x30, Y0, X2 ); + Y1 = _mm512_mask_blend_epi64( 0xc0, Y1, X2 ); + Y2 = _mm512_mask_blend_epi64( 0x03, Y2, X2 ); + Y3 = _mm512_mask_blend_epi64( 0x0c, Y3, X2 ); + + Y0 = _mm512_mask_blend_epi64( 0x0c, Y0, X3 ); + Y1 = _mm512_mask_blend_epi64( 0x30, Y1, X3 ); + Y2 = _mm512_mask_blend_epi64( 0xc0, Y2, X3 ); + Y3 = _mm512_mask_blend_epi64( 0x03, Y3, X3 ); + + B[0] = _mm512_add_epi32( B[0], Y0 ); + B[1] = _mm512_add_epi32( B[1], Y1 ); + B[2] = _mm512_add_epi32( B[2], Y2 ); + B[3] = _mm512_add_epi32( B[3], Y3 ); +} + +// data format for 512 bits: 4 * ( 4 way 32 ) +// { l3d3, l2d3, l1d3, l0d3, l3d2, l2d2, l1d2, l0d2, +// l3d1, l2d1, l1d1, l0d1, l3d0, l2d0, l1d0, l0d0 } + +void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 4*128 ); + salsa8_simd128_4way( &X[ 0], &X[16] ); + salsa8_simd128_4way( &X[16], &X[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + uint32_t x16[4]; // index into V for each lane + memcpy( x16, &X[16], 16 ); + x16[0] = 32 * ( x16[0] & ( N-1) ); + x16[1] = 32 * ( x16[1] & ( N-1) ); + x16[2] = 32 * ( x16[2] & ( N-1) ); + x16[3] = 32 * ( x16[3] & ( N-1) ); + m128_ovly *v = (m128_ovly*)V; + + for( int k = 0; k < 32; k++ ) + { + X[k] = _mm_xor_si128( X[k], _mm_set_epi32( v[ x16[3] + k ].u32[3], + v[ x16[2] + k ].u32[2], + v[ x16[1] + k 
].u32[1], + v[ x16[0] + k ].u32[0] ) ); + } + + salsa8_simd128_4way( &X[ 0], &X[16] ); + salsa8_simd128_4way( &X[16], &X[ 0] ); + } +} + +// not working, occasional accepted shares, not up to date. +// 4x128 interleaving +static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) +{ + __m512i X0, X1, X2, X3; + uint32_t *b = (uint32_t*)B; + m512_ovly y[4], z[4]; + + // mix C into B then shuffle B into X + B[0] = _mm512_xor_si512( B[0], C[0] ); + B[1] = _mm512_xor_si512( B[1], C[1] ); + B[2] = _mm512_xor_si512( B[2], C[2] ); + B[3] = _mm512_xor_si512( B[3], C[3] ); + + // { l3u15, l3u10, l3u5, l3u0, l2u15, l2u10, l2u5, l2u0, + // l1u15, l1u10, l1u5, l1u0, l0u15, l0u10, l0u5, l0u0 } + + // b index = row index + lane index + unit index + // = ( 8 * (u/4) ) + ( 4*l ) + ( u%4 ) + + X0 = _mm512_set_epi32( b[63], b[46], b[29], b[12], // lane 3[3:0] + b[59], b[42], b[25], b[ 8], // lane 2[3:0] + b[55], b[38], b[21], b[ 4], // lane 1[3:0] + b[51], b[34], b[17], b[ 0] ); // lane 0[3:0] + + X1 = _mm512_set_epi32( b[15], b[62], b[45], b[28], + b[11], b[58], b[41], b[24], + b[ 7], b[54], b[37], b[20], + b[ 3], b[50], b[33], b[16] ); // lane 0[7:4] + + X2 = _mm512_set_epi32( b[31], b[14], b[61], b[44], + b[27], b[10], b[57], b[40], + b[23], b[ 6], b[53], b[36], + b[19], b[ 2], b[49], b[32] ); + + X3 = _mm512_set_epi32( b[47], b[30], b[13], b[60], + b[43], b[26], b[ 9], b[56], + b[39], b[22], b[ 5], b[52], + b[35], b[18], b[ 1], b[48] ); + + + + // define targets for macros used in round function template + #define ROL_1X32 mm512_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm512_shuflr128_32 + #define SWAP_64 mm512_swap128_64 + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS_FINAL_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + y[0].m512 = X0; + y[1].m512 = X1; + y[2].m512 = X2; + y[3].m512 = X3; + + // lane 0 + z[0].u32[ 0 ] = y[0].u32[ 0]; + z[0].u32[ 3 ] = y[1].u32[ 0]; + z[0].u32[ 2 ] = y[2].u32[ 0]; + z[0].u32[ 1 ] = y[3].u32[ 0]; + + // lane 1 + z[0].u32[ 0+ 4 ] = y[0].u32[ 4]; + z[0].u32[ 3+ 4 ] = y[1].u32[ 4]; + z[0].u32[ 2+ 4 ] = y[2].u32[ 4]; + z[0].u32[ 1+ 4 ] = y[3].u32[ 4]; + + // lane 2 + z[0].u32[ 0+ 8 ] = y[0].u32[ 8]; + z[0].u32[ 3+ 8 ] = y[1].u32[ 8]; + z[0].u32[ 2+ 8 ] = y[2].u32[ 8]; + z[0].u32[ 1+ 8 ] = y[3].u32[ 8]; + + // lane 3 + z[0].u32[ 0+12 ] = y[0].u32[12]; + z[0].u32[ 3+12 ] = y[1].u32[12]; + z[0].u32[ 2+12 ] = y[2].u32[12]; + z[0].u32[ 1+12 ] = y[3].u32[12]; + + // lane 0 + z[1].u32[ 1 ] = y[0].u32[ 1]; + z[1].u32[ 0 ] = y[1].u32[ 1]; + z[1].u32[ 3 ] = y[2].u32[ 1]; + z[1].u32[ 2 ] = y[3].u32[ 1]; + + //lane 1 + z[1].u32[ 1+ 4 ] = y[0].u32[ 5]; + z[1].u32[ 0+ 4 ] = y[1].u32[ 5]; + z[1].u32[ 3+ 4 ] = y[2].u32[ 5]; + z[1].u32[ 2+ 4 ] = y[3].u32[ 5]; + + // lane 2 + z[1].u32[ 1+ 8 ] = y[0].u32[ 9]; + z[1].u32[ 0+ 8 ] = y[1].u32[ 9]; + z[1].u32[ 3+ 8 ] = y[2].u32[ 9]; + z[1].u32[ 2+ 8 ] = y[3].u32[ 9]; + + // lane 3 + z[1].u32[ 1+12 ] = y[0].u32[13]; + z[1].u32[ 0+12 ] = y[1].u32[13]; + z[1].u32[ 3+12 ] = y[2].u32[13]; + z[1].u32[ 2+12 ] = y[3].u32[13]; + + // lane 0 + z[2].u32[ 2 ] = y[0].u32[2]; + z[2].u32[ 1 ] = y[1].u32[2]; + z[2].u32[ 0 ] = y[2].u32[2]; + z[2].u32[ 3 ] = y[3].u32[2]; + + // lane 1 + z[2].u32[ 2+ 4 ] = y[0].u32[6]; + z[2].u32[ 1+ 4 ] = y[1].u32[6]; + z[2].u32[ 0+ 4 ] = y[2].u32[6]; + z[2].u32[ 3+ 4 ] = y[3].u32[6]; + + // lane 2 + z[2].u32[ 2+ 8 ] = y[0].u32[10]; + z[2].u32[ 1+ 8 ] = 
y[1].u32[10]; + z[2].u32[ 0+ 8 ] = y[2].u32[10]; + z[2].u32[ 3+ 8 ] = y[3].u32[10]; + + // lane 3 + z[2].u32[ 2+12 ] = y[0].u32[14]; + z[2].u32[ 1+12 ] = y[1].u32[14]; + z[2].u32[ 0+12 ] = y[2].u32[14]; + z[2].u32[ 3+12 ] = y[3].u32[14]; + + // lane 0 + z[3].u32[ 3 ] = y[0].u32[ 3]; + z[3].u32[ 2 ] = y[1].u32[ 3]; + z[3].u32[ 1 ] = y[2].u32[ 3]; + z[3].u32[ 0 ] = y[3].u32[ 3]; + + // lane 1 + z[3].u32[ 3+ 4 ] = y[0].u32[ 7]; + z[3].u32[ 2+ 4 ] = y[1].u32[ 7]; + z[3].u32[ 1+ 4 ] = y[2].u32[ 7]; + z[3].u32[ 0+ 4 ] = y[3].u32[ 7]; + + // lane 2 + z[3].u32[ 3+ 8 ] = y[0].u32[11]; + z[3].u32[ 2+ 8 ] = y[1].u32[11]; + z[3].u32[ 1+ 8 ] = y[2].u32[11]; + z[3].u32[ 0+ 8 ] = y[3].u32[11]; + + // lane 1 + z[3].u32[ 3+12 ] = y[0].u32[15]; + z[3].u32[ 2+12 ] = y[1].u32[15]; + z[3].u32[ 1+12 ] = y[2].u32[15]; + z[3].u32[ 0+12 ] = y[3].u32[15]; + + B[0] = _mm512_add_epi32( B[0], z[0].m512 ); + B[1] = _mm512_add_epi32( B[1], z[1].m512 ); + B[2] = _mm512_add_epi32( B[2], z[2].m512 ); + B[3] = _mm512_add_epi32( B[3], z[3].m512 ); +} + +void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 8], X, 128*4 ); + salsa8_4way_simd128( &X[0], &X[4] ); + salsa8_4way_simd128( &X[4], &X[0] ); + } + + for ( int i = 0; i < N; i++ ) + { + m512_ovly x16; + x16 = ( (m512_ovly*)X )[4]; + uint32_t j0 = 8 * ( x16.u32[ 0] & ( N-1 ) ); + uint32_t j1 = 8 * ( x16.u32[ 4] & ( N-1 ) ); + uint32_t j2 = 8 * ( x16.u32[ 8] & ( N-1 ) ); + uint32_t j3 = 8 * ( x16.u32[12] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + X[k] = _mm512_xor_si512( X[k], m512_const_128( + ( (m512_ovly*)V )[ j3+k ].m128[3], + ( (m512_ovly*)V )[ j2+k ].m128[2], + ( (m512_ovly*)V )[ j1+k ].m128[1], + ( (m512_ovly*)V )[ j0+k ].m128[0] ) ); + +/* + for ( int k = 0; k < 8; k++ ) + X[k] = _mm512_xor_si512( X[k], m512_diagonal128_32( + V[ j3+k ], V[ j2+k ], V[ j1+k ], V[ j0+k ] ) ); +*/ + salsa8_4way_simd128( &X[0], &X[4] ); + salsa8_4way_simd128( &X[4], &X[0] ); + } +} + + + +#endif // AVX512 + +#if defined(__AVX2__) + +// 8x memory usage +// Tested OK but slow scrypt, very slow scryptn2, 2x4way is faster +// Crashes with large N & many threads, OOM? 
Use only for scrypt +// 8x32 interleaving +static void salsa8_8way( __m256i * const B, const __m256i * const C ) +{ + __m256i x0 = B[ 0] = _mm256_xor_si256( B[ 0], C[ 0] ); + __m256i x1 = B[ 1] = _mm256_xor_si256( B[ 1], C[ 1] ); + __m256i x2 = B[ 2] = _mm256_xor_si256( B[ 2], C[ 2] ); + __m256i x3 = B[ 3] = _mm256_xor_si256( B[ 3], C[ 3] ); + __m256i x4 = B[ 4] = _mm256_xor_si256( B[ 4], C[ 4] ); + __m256i x5 = B[ 5] = _mm256_xor_si256( B[ 5], C[ 5] ); + __m256i x6 = B[ 6] = _mm256_xor_si256( B[ 6], C[ 6] ); + __m256i x7 = B[ 7] = _mm256_xor_si256( B[ 7], C[ 7] ); + __m256i x8 = B[ 8] = _mm256_xor_si256( B[ 8], C[ 8] ); + __m256i x9 = B[ 9] = _mm256_xor_si256( B[ 9], C[ 9] ); + __m256i xa = B[10] = _mm256_xor_si256( B[10], C[10] ); + __m256i xb = B[11] = _mm256_xor_si256( B[11], C[11] ); + __m256i xc = B[12] = _mm256_xor_si256( B[12], C[12] ); + __m256i xd = B[13] = _mm256_xor_si256( B[13], C[13] ); + __m256i xe = B[14] = _mm256_xor_si256( B[14], C[14] ); + __m256i xf = B[15] = _mm256_xor_si256( B[15], C[15] ); + + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] = _mm256_add_epi32( B[ 0], x0 ); + B[ 1] = _mm256_add_epi32( B[ 1], x1 ); + B[ 2] = _mm256_add_epi32( B[ 2], x2 ); + B[ 3] = _mm256_add_epi32( B[ 3], x3 ); + B[ 4] = _mm256_add_epi32( B[ 4], x4 ); + B[ 5] = _mm256_add_epi32( B[ 5], x5 ); + B[ 6] = _mm256_add_epi32( B[ 6], x6 ); + B[ 7] = _mm256_add_epi32( B[ 7], x7 ); + B[ 8] = _mm256_add_epi32( B[ 8], x8 ); + B[ 9] = _mm256_add_epi32( B[ 9], x9 ); + B[10] = _mm256_add_epi32( B[10], xa ); + B[11] = _mm256_add_epi32( B[11], xb ); + B[12] = _mm256_add_epi32( B[12], xc ); + B[13] = _mm256_add_epi32( B[13], xd ); + B[14] = _mm256_add_epi32( B[14], xe ); + B[15] = _mm256_add_epi32( B[15], xf ); +} + +void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 128*8 ); + salsa8_8way( &X[ 0], &X[16] ); + salsa8_8way( &X[16], &X[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + m256_ovly *vptr[8]; // pointer to V offset for each lane + m256_ovly *x16 = (m256_ovly*)(&X[16]); + + // create pointers to V for each lane using data from each lane of X[16] + // as index. + for ( int l = 0; l < 8; l++ ) + { + uint32_t xl = (*x16).u32[l]; + vptr[l] = (m256_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); + } + + for ( int k = 0; k < 32; k++ ) + { + m256_ovly v; // V value assembled from different indexes + for ( int l = 0; l < 8; l++ ) + v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; + X[ k ] = _mm256_xor_si256( X[ k ], v.m256 ); + } + + salsa8_8way( &X[ 0], &X[16] ); + salsa8_8way( &X[16], &X[ 0] ); + } +} + +// 2x memory usage +// Working, not up to date, needs stream optimization. 
+// Essentially Pooler 6way +// 2x128 interleaved simd128 +// ------- lane 1 ------- ------- lane 0 ------- +// { l1x3, l1x2, l1x1, l1x0, l0x3, l0x2, l0x1, l0x0 } b[3] B[ 7: 0] +// { l1x7, l1x6, l1x5, l1x4, l0x7, l0x6, l0x5, l0x4 } b[2] B[15: 8] +// { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16] +// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24] + +static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) +{ + __m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + + // mix C into B then shuffle B into X + B[0] = _mm256_xor_si256( B[0], C[0] ); + B[1] = _mm256_xor_si256( B[1], C[1] ); + B[2] = _mm256_xor_si256( B[2], C[2] ); + B[3] = _mm256_xor_si256( B[3], C[3] ); + + Y0 = _mm256_blend_epi32( B[1], B[0], 0x11 ); + X0 = _mm256_blend_epi32( B[3], B[2], 0x44 ); + X0 = _mm256_blend_epi32( X0, Y0, 0x33); + + Y1 = _mm256_blend_epi32( B[2], B[1], 0x11 ); + X1 = _mm256_blend_epi32( B[0], B[3], 0x44 ); + X1 = _mm256_blend_epi32( X1, Y1, 0x33 ); + + Y2 = _mm256_blend_epi32( B[3], B[2], 0x11 ); + X2 = _mm256_blend_epi32( B[1], B[0], 0x44 ); + X2 = _mm256_blend_epi32( X2, Y2, 0x33 ); + + Y3 = _mm256_blend_epi32( B[0], B[3], 0x11 ); + X3 = _mm256_blend_epi32( B[2], B[1], 0x44 ); + X3 = _mm256_blend_epi32( X3, Y3, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + // init with X0 then blend in the other elements + + Y0 = _mm256_blend_epi32( X0, X1, 0x88 ); + Y1 = _mm256_blend_epi32( X0, X1, 0x11 ); + Y2 = _mm256_blend_epi32( X0, X1, 0x22 ); + Y3 = _mm256_blend_epi32( X0, X1, 0x44 ); + + Y0 = _mm256_blend_epi32( Y0, X2, 0x44 ); + Y1 = _mm256_blend_epi32( Y1, X2, 0x88 ); + Y2 = _mm256_blend_epi32( Y2, X2, 0x11 ); + Y3 = _mm256_blend_epi32( Y3, X2, 0x22 ); + + Y0 = _mm256_blend_epi32( Y0, X3, 0x22 ); + Y1 = _mm256_blend_epi32( Y1, X3, 0x44 ); + Y2 = _mm256_blend_epi32( Y2, X3, 0x88 ); + Y3 = _mm256_blend_epi32( Y3, X3, 0x11 ); + + B[0] = _mm256_add_epi32( B[0], Y0 ); + B[1] = _mm256_add_epi32( B[1], Y1 ); + B[2] = _mm256_add_epi32( B[2], Y2 ); + B[3] = _mm256_add_epi32( B[3], Y3 ); +} + +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 8], X, 128*2 ); + salsa8_2way_simd128( &X[0], &X[4] ); + salsa8_2way_simd128( &X[4], &X[0] ); + } + + for ( int i = 0; i < N; i++ ) + { + m256_ovly x16; + x16 = ( (m256_ovly*)X )[4]; + uint32_t j0 = 8 * ( x16.u32[0] & ( N-1 ) ); + uint32_t j1 = 8 * ( x16.u32[4] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + X[k] = _mm256_xor_si256( X[k], _mm256_blend_epi32( V[ j1+k ], + V[ j0+k ], 0x0f ) ); + + salsa8_2way_simd128( &X[0], &X[4] ); + salsa8_2way_simd128( &X[4], &X[0] ); + } +} + +// Working +// 2x128 interleaving +static void salsa8_2way_simd128_2buf( __m256i * const BA, __m256i * const BB, + const __m256i * const CA, const __m256i * const CB ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BA[2] = 
_mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x11 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x11 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x44 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x44 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x33); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x33); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x11 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x11 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x44 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x44 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x33 ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x11 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x11 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x44 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x44 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x33 ); + XB2 = _mm256_blend_epi32( XB2, YB0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x11 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x11 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x44 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x44 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x33 ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_2BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0x88 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0x88 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x11 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x11 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x22 ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x22 ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x44 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x44 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 0x44 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x44 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0x88 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0x88 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x11 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x11 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x22 ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x22 ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x22 ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x22 ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x44 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x44 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0x88 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0x88 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x11 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x11 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + +} + +void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ) +{ + __m256i *X0 = X; + __m256i *X1 = X + 8; + __m256i *V0 = V; + __m256i *V1 = V + 8*N; + + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + { + _mm256_stream_si256( V0 + i*8 + k, X0[k] ); + 
_mm256_stream_si256( V1 + i*8 + k, X1[k] ); + } + salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); + salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); + } + for ( int i = 0; i < N; i++ ) + { + const m256_ovly x16a = ( (m256_ovly*)X0 )[4]; + const m256_ovly x16b = ( (m256_ovly*)X1 )[4]; + + const uint32_t j0a = 8 * ( x16a.u32[0] & ( N-1 ) ); + const uint32_t j0b = 8 * ( x16b.u32[0] & ( N-1 ) ); + const uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); + const uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + { + const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + k ); + const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + k ); + const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + k ); + const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + k ); + X0[k] = _mm256_xor_si256( X0[k], + _mm256_blend_epi32( V0j1a, V0j0a, 0x0f ) ); + X1[k] = _mm256_xor_si256( X1[k], + _mm256_blend_epi32( V1j1b, V1j0b, 0x0f ) ); + + +/* + X0[k] = _mm256_xor_si256( X0[k], + _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); + X1[k] = _mm256_xor_si256( X1[k], + _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); +*/ + + } + + salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); + salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); + } +} + +// Triple buffered, not up to date, needs stream optimization +// 2x128 interleaving +static void salsa8_2way_simd128_3buf( __m256i * const BA, __m256i * const BB, + __m256i * const BC, const __m256i * const CA, const __m256i * const CB, + const __m256i * const CC ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, XC0, XC1, XC2, XC3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BC[0] = _mm256_xor_si256( BC[0], CC[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BC[1] = _mm256_xor_si256( BC[1], CC[1] ); + BA[2] = _mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BC[2] = _mm256_xor_si256( BC[2], CC[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + BC[3] = _mm256_xor_si256( BC[3], CC[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x11 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x11 ); + YC0 = _mm256_blend_epi32( BC[1], BC[0], 0x11 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x44 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x44 ); + XC0 = _mm256_blend_epi32( BC[3], BC[2], 0x44 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x33); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x33); + XC0 = _mm256_blend_epi32( XC0, YC0, 0x33); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x11 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x11 ); + YC0 = _mm256_blend_epi32( BC[2], BC[1], 0x11 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x44 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x44 ); + XC1 = _mm256_blend_epi32( BC[0], BC[3], 0x44 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x33 ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x33 ); + XC1 = _mm256_blend_epi32( XC1, YC0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x11 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x11 ); + YC0 = _mm256_blend_epi32( BC[3], BC[2], 0x11 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x44 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x44 ); + XC2 = _mm256_blend_epi32( BC[1], BC[0], 0x44 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x33 ); + XB2 = 
_mm256_blend_epi32( XB2, YB0, 0x33 ); + XC2 = _mm256_blend_epi32( XC2, YC0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x11 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x11 ); + YC0 = _mm256_blend_epi32( BC[0], BC[3], 0x11 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x44 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x44 ); + XC3 = _mm256_blend_epi32( BC[2], BC[1], 0x44 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x33 ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x33 ); + XC3 = _mm256_blend_epi32( XC3, YC0, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_3BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0x88 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0x88 ); + YC0 = _mm256_blend_epi32( XC0, XC1, 0x88 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x11 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x11 ); + YC1 = _mm256_blend_epi32( XC0, XC1, 0x11 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x22 ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x22 ); + YC2 = _mm256_blend_epi32( XC0, XC1, 0x22 ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x44 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x44 ); + YC3 = _mm256_blend_epi32( XC0, XC1, 0x44 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 0x44 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x44 ); + YC0 = _mm256_blend_epi32( YC0, XC2, 0x44 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0x88 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0x88 ); + YC1 = _mm256_blend_epi32( YC1, XC2, 0x88 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x11 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x11 ); + YC2 = _mm256_blend_epi32( YC2, XC2, 0x11 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x22 ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x22 ); + YC3 = _mm256_blend_epi32( YC3, XC2, 0x22 ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x22 ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x22 ); + YC0 = _mm256_blend_epi32( YC0, XC3, 0x22 ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x44 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x44 ); + YC1 = _mm256_blend_epi32( YC1, XC3, 0x44 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0x88 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0x88 ); + YC2 = _mm256_blend_epi32( YC2, XC3, 0x88 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x11 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x11 ); + YC3 = _mm256_blend_epi32( YC3, XC3, 0x11 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BC[0] = _mm256_add_epi32( BC[0], YC0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BC[1] = _mm256_add_epi32( BC[1], YC1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BC[2] = _mm256_add_epi32( BC[2], YC2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + BC[3] = _mm256_add_epi32( BC[3], YC3 ); + +} + +void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ) +{ + __m256i *X0 = X; + __m256i *X1 = X+8; + __m256i *X2 = X+16; + __m256i *V0 = V; + __m256i *V1 = V + 8*N; + __m256i *V2 = V + 16*N; + + for ( int i = 0; i < N; i++ ) + { + memcpy( &V0[i * 8], X0, 128*2 ); + memcpy( &V1[i * 8], X1, 128*2 ); + memcpy( &V2[i * 8], X2, 128*2 ); + salsa8_2way_simd128_3buf( 
&X0[0], &X1[0], &X2[0], + &X0[4], &X1[4], &X2[4] ); + salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], + &X0[0], &X1[0], &X2[0] ); + } + for ( int i = 0; i < N; i++ ) + { + m256_ovly x16a, x16b, x16c; + x16a = ( (m256_ovly*)X0 )[4]; + x16b = ( (m256_ovly*)X1 )[4]; + x16c = ( (m256_ovly*)X2 )[4]; + + uint32_t j0a = 8 * ( x16a.u32[0] & ( N-1 ) ); + uint32_t j0b = 8 * ( x16b.u32[0] & ( N-1 ) ); + uint32_t j0c = 8 * ( x16c.u32[0] & ( N-1 ) ); + uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); + uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); + uint32_t j1c = 8 * ( x16c.u32[4] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + { + X0[k] = _mm256_xor_si256( X0[k], + _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); + X1[k] = _mm256_xor_si256( X1[k], + _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); + X2[k] = _mm256_xor_si256( X2[k], + _mm256_blend_epi32( V2[ j1c+k ], V2[ j0c+k ], 0x0f ) ); + } + + salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], + &X0[4], &X1[4], &X2[4] ); + salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], + &X0[0], &X1[0], &X2[0] ); + } +} + + +// 2x memory usage + +// Tested OK, good speed +// +// Serial SIMD over 2 way parallel + +// Uses uint64_t as a poorman's vector then applying linear SIMD to the +// pairs of data. +// +// Interleaving is standard 2 way. +// Use 64 bit shuffles but 32 bit arithmetic. + +// B = { lane1, lane0 } +// b[i] = { B[4*i+3], B[4*i+2], B[4*i+1], B[4*i] } + +// 2x32 interleaving +static void salsa8_simd128_2way( uint64_t *b, const uint64_t *c ) +{ + __m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m256i *B = (__m256i*)b; + const __m256i *C = (const __m256i*)c; + + // mix C into B then shuffle B into X + B[0] = _mm256_xor_si256( B[0], C[0] ); + B[1] = _mm256_xor_si256( B[1], C[1] ); + B[2] = _mm256_xor_si256( B[2], C[2] ); + B[3] = _mm256_xor_si256( B[3], C[3] ); + + Y0 = _mm256_blend_epi32( B[1], B[0], 0x03 ); + X0 = _mm256_blend_epi32( B[3], B[2], 0x30 ); + X0 = _mm256_blend_epi32( X0, Y0, 0x0f); + + Y0 = _mm256_blend_epi32( B[2], B[1], 0x03 ); + X1 = _mm256_blend_epi32( B[0], B[3], 0x30 ); + X1 = _mm256_blend_epi32( X1, Y0, 0x0f ); + + Y0 = _mm256_blend_epi32( B[3], B[2], 0x03 ); + X2 = _mm256_blend_epi32( B[1], B[0], 0x30 ); + X2 = _mm256_blend_epi32( X2, Y0, 0x0f ); + + Y0 = _mm256_blend_epi32( B[0], B[3], 0x03 ); + X3 = _mm256_blend_epi32( B[2], B[1], 0x30 ); + X3 = _mm256_blend_epi32( X3, Y0, 0x0f ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + Y0 = _mm256_blend_epi32( X0, X1, 0xc0 ); + Y1 = _mm256_blend_epi32( X0, X1, 0x03 ); + Y2 = _mm256_blend_epi32( X0, X1, 0x0c ); + Y3 = _mm256_blend_epi32( X0, X1, 0x30 ); + + Y0 = _mm256_blend_epi32( Y0, X2, 0x30 ); + Y1 = _mm256_blend_epi32( Y1, X2, 0xc0 ); + Y2 = _mm256_blend_epi32( Y2, X2, 0x03 ); + Y3 = _mm256_blend_epi32( Y3, X2, 0x0c ); + + Y0 = _mm256_blend_epi32( Y0, X3, 0x0c ); + Y1 = _mm256_blend_epi32( Y1, X3, 0x30 ); + Y2 = _mm256_blend_epi32( Y2, X3, 0xc0 ); + Y3 = _mm256_blend_epi32( Y3, X3, 0x03 ); + + B[0] = _mm256_add_epi32( B[0], Y0 ); + B[1] = _mm256_add_epi32( B[1], Y1 ); + B[2] = _mm256_add_epi32( B[2], Y2 ); + B[3] = _mm256_add_epi32( B[3], Y3 ); + +} + +// data format for 256 bits: 4 * ( 2 way 32 ) +// { l1d3, l0d3, l1d2, l0d2, l1d1, 
l0d1, l1d0, l0d0 } + +void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + _mm256_stream_si256( (__m256i*)V + i*8 + k, casti_m256i( X, k ) ); + salsa8_simd128_2way( &X[ 0], &X[16] ); + salsa8_simd128_2way( &X[16], &X[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + // need 2 J's + const uint32_t j0 = 32 * ( (uint32_t)( X[16] ) & ( N-1 ) ); + const uint32_t j1 = 32 * ( (uint32_t)( X[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + X[k] ^= ( ( V[ j1 + k ] & 0xffffffff00000000 ) + | ( V[ j0 + k ] & 0x00000000ffffffff ) ); + + salsa8_simd128_2way( &X[ 0], &X[16] ); + salsa8_simd128_2way( &X[16], &X[ 0] ); + } +} + +// Double buffered, 4x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_2buf( uint64_t *ba, uint64_t *bb, + const uint64_t *ca, const uint64_t *cb ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + __m256i *BA = (__m256i*)ba; + __m256i *BB = (__m256i*)bb; + const __m256i *CA = (const __m256i*)ca; + const __m256i *CB = (const __m256i*)cb; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BA[2] = _mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x03 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x03 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x30 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x30 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x0f); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x0f); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x03 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x03 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x30 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x30 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x0f ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x0f ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x03 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x03 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x30 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x30 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x0f ); + XB2 = _mm256_blend_epi32( XB2, YB0, 0x0f ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x03 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x03 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x30 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x30 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x0f ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x0f ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_2BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0xc0 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0xc0 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x03 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x03 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x0c ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x0c ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x30 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x30 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 
0x30 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x30 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0xc0 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0xc0 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x03 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x03 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x0c ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x0c ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x0c ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x0c ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x30 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x30 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0xc0 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0xc0 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x03 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x03 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + +} + +void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ) + +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + { + _mm256_stream_si256( (__m256i*)V0 + i*8 + k, casti_m256i( X0, k ) ); + _mm256_stream_si256( (__m256i*)V1 + i*8 + k, casti_m256i( X1, k ) ); + } + salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + // need 4 J's + const uint32_t j0l = 32 * ( (const uint32_t)( X0[16] ) & ( N-1 ) ); + const uint32_t j0h = 32 * ( (const uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + const uint32_t j1l = 32 * ( (const uint32_t)( X1[16] ) & ( N-1 ) ); + const uint32_t j1h = 32 * ( (const uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) + | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); + X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) + | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } +} + +// Working, deprecated, not up to date +// Triple buffered 2 way, 6x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_3buf( uint64_t *BA, uint64_t *BB, + uint64_t *BC, const uint64_t *CA, const uint64_t *CB, + const uint64_t *CC ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3; + __m256i *ba = (__m256i*)BA; + __m256i *bb = (__m256i*)BB; + __m256i *bc = (__m256i*)BC; + const __m256i *ca = (const __m256i*)CA; + const __m256i *cb = (const __m256i*)CB; + const __m256i *cc = (const __m256i*)CC; + m256_ovly ya[4], yb[4], yc[4], + za[4], zb[4], zc[4]; + + // mix C into B then shuffle B into X + ba[0] = _mm256_xor_si256( ba[0], ca[0] ); + bb[0] = _mm256_xor_si256( bb[0], cb[0] ); + bc[0] = _mm256_xor_si256( bc[0], cc[0] ); + ba[1] = _mm256_xor_si256( ba[1], ca[1] ); + bb[1] = _mm256_xor_si256( bb[1], cb[1] ); + bc[1] = _mm256_xor_si256( bc[1], cc[1] ); + ba[2] = _mm256_xor_si256( ba[2], ca[2] ); + bb[2] = _mm256_xor_si256( bb[2], cb[2] ); + bc[2] = _mm256_xor_si256( bc[2], cc[2] ); + ba[3] = _mm256_xor_si256( ba[3], ca[3] ); + bb[3] = _mm256_xor_si256( bb[3], cb[3] ); + bc[3] = _mm256_xor_si256( bc[3], cc[3] ); + + XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] ); + XB0 = 
_mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] ); + XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] ); + XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] ); + XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] ); + XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] ); + XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] ); + XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] ); + XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] ); + XA3 = _mm256_set_epi64x( BA[11], BA[ 6], BA[ 1], BA[12] ); + XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] ); + XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_FINAL_SIMD128_3BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + ya[0].m256 = XA0; yb[0].m256 = XB0; + yc[0].m256 = XC0; + ya[1].m256 = XA1; yb[1].m256 = XB1; + yc[1].m256 = XC1; + ya[2].m256 = XA2; yb[2].m256 = XB2; + yc[2].m256 = XC2; + ya[3].m256 = XA3; yb[3].m256 = XB3; + yc[3].m256 = XC3; + + za[0].u64[0] = ya[0].u64[0]; + zb[0].u64[0] = yb[0].u64[0]; + zc[0].u64[0] = yc[0].u64[0]; + za[0].u64[3] = ya[1].u64[0]; + zb[0].u64[3] = yb[1].u64[0]; + zc[0].u64[3] = yc[1].u64[0]; + za[0].u64[2] = ya[2].u64[0]; + zb[0].u64[2] = yb[2].u64[0]; + zc[0].u64[2] = yc[2].u64[0]; + za[0].u64[1] = ya[3].u64[0]; + zb[0].u64[1] = yb[3].u64[0]; + zc[0].u64[1] = yc[3].u64[0]; + + za[1].u64[1] = ya[0].u64[1]; + zb[1].u64[1] = yb[0].u64[1]; + zc[1].u64[1] = yc[0].u64[1]; + za[1].u64[0] = ya[1].u64[1]; + zb[1].u64[0] = yb[1].u64[1]; + zc[1].u64[0] = yc[1].u64[1]; + za[1].u64[3] = ya[2].u64[1]; + zb[1].u64[3] = yb[2].u64[1]; + zc[1].u64[3] = yc[2].u64[1]; + za[1].u64[2] = ya[3].u64[1]; + zb[1].u64[2] = yb[3].u64[1]; + zc[1].u64[2] = yc[3].u64[1]; + + za[2].u64[2] = ya[0].u64[2]; + zb[2].u64[2] = yb[0].u64[2]; + zc[2].u64[2] = yc[0].u64[2]; + za[2].u64[1] = ya[1].u64[2]; + zb[2].u64[1] = yb[1].u64[2]; + zc[2].u64[1] = yc[1].u64[2]; + za[2].u64[0] = ya[2].u64[2]; + zb[2].u64[0] = yb[2].u64[2]; + zc[2].u64[0] = yc[2].u64[2]; + za[2].u64[3] = ya[3].u64[2]; + zb[2].u64[3] = yb[3].u64[2]; + zc[2].u64[3] = yc[3].u64[2]; + + za[3].u64[3] = ya[0].u64[3]; + zb[3].u64[3] = yb[0].u64[3]; + zc[3].u64[3] = yc[0].u64[3]; + za[3].u64[2] = ya[1].u64[3]; + zb[3].u64[2] = yb[1].u64[3]; + zc[3].u64[2] = yc[1].u64[3]; + za[3].u64[1] = ya[2].u64[3]; + zb[3].u64[1] = yb[2].u64[3]; + zc[3].u64[1] = yc[2].u64[3]; + za[3].u64[0] = ya[3].u64[3]; + zb[3].u64[0] = yb[3].u64[3]; + zc[3].u64[0] = yc[3].u64[3]; + + ba[0] = _mm256_add_epi32( ba[0], za[0].m256 ); + bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 ); + bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 ); + ba[1] = _mm256_add_epi32( ba[1], za[1].m256 ); + bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 ); + bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 ); + ba[2] = _mm256_add_epi32( ba[2], za[2].m256 ); + bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 ); + bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 ); + ba[3] = _mm256_add_epi32( ba[3], za[3].m256 ); + bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 ); + bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 ); +} + +void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, + const uint32_t N ) +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + 
uint64_t *X2 = X+64; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + uint64_t *V2 = V + 64*N; + + for ( int i = 0; i < N; i++ ) + { + memcpy( &V0[i * 32], X0, 2*128 ); + memcpy( &V1[i * 32], X1, 2*128 ); + memcpy( &V2[i * 32], X2, 2*128 ); + salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); + uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) ); + uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); + uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) + | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); + X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) + | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) + | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } +} + +// Working, deprecated +// 8x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_4buf( uint64_t *BA, uint64_t *BB, + uint64_t *BC, uint64_t *BD, const uint64_t *CA, const uint64_t *CB, + const uint64_t *CC, const uint64_t *CD ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; + __m256i *ba = (__m256i*)BA; + __m256i *bb = (__m256i*)BB; + __m256i *bc = (__m256i*)BC; + __m256i *bd = (__m256i*)BD; + const __m256i *ca = (const __m256i*)CA; + const __m256i *cb = (const __m256i*)CB; + const __m256i *cc = (const __m256i*)CC; + const __m256i *cd = (const __m256i*)CD; + m256_ovly ya[4], yb[4], yc[4], yd[4], + za[4], zb[4], zc[4], zd[4]; + + // mix C into B then shuffle B into X + ba[0] = _mm256_xor_si256( ba[0], ca[0] ); + bb[0] = _mm256_xor_si256( bb[0], cb[0] ); + bc[0] = _mm256_xor_si256( bc[0], cc[0] ); + bd[0] = _mm256_xor_si256( bd[0], cd[0] ); + ba[1] = _mm256_xor_si256( ba[1], ca[1] ); + bb[1] = _mm256_xor_si256( bb[1], cb[1] ); + bc[1] = _mm256_xor_si256( bc[1], cc[1] ); + bd[1] = _mm256_xor_si256( bd[1], cd[1] ); + ba[2] = _mm256_xor_si256( ba[2], ca[2] ); + bb[2] = _mm256_xor_si256( bb[2], cb[2] ); + bc[2] = _mm256_xor_si256( bc[2], cc[2] ); + bd[2] = _mm256_xor_si256( bd[2], cd[2] ); + ba[3] = _mm256_xor_si256( ba[3], ca[3] ); + bb[3] = _mm256_xor_si256( bb[3], cb[3] ); + bc[3] = _mm256_xor_si256( bc[3], cc[3] ); + bd[3] = _mm256_xor_si256( bd[3], cd[3] ); + + XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] ); + XB0 = _mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] ); + XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] ); + XD0 = _mm256_set_epi64x( BD[15], BD[10], BD[ 5], BD[ 0] ); + XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] ); + XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] ); + XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] ); + XD1 = _mm256_set_epi64x( BD[ 3], BD[14], BD[ 9], BD[ 4] ); + XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] ); + XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] ); + XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] ); + XD2 = _mm256_set_epi64x( BD[ 7], BD[ 2], BD[13], BD[ 8] ); + XA3 = _mm256_set_epi64x( BA[11], 
BA[ 6], BA[ 1], BA[12] ); + XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] ); + XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] ); + XD3 = _mm256_set_epi64x( BD[11], BD[ 6], BD[ 1], BD[12] ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_FINAL_SIMD128_4BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + ya[0].m256 = XA0; yb[0].m256 = XB0; + yc[0].m256 = XC0; yd[0].m256 = XD0; + ya[1].m256 = XA1; yb[1].m256 = XB1; + yc[1].m256 = XC1; yd[1].m256 = XD1; + ya[2].m256 = XA2; yb[2].m256 = XB2; + yc[2].m256 = XC2; yd[2].m256 = XD2; + ya[3].m256 = XA3; yb[3].m256 = XB3; + yc[3].m256 = XC3; yd[3].m256 = XD3; + + za[0].u64[0] = ya[0].u64[0]; + zb[0].u64[0] = yb[0].u64[0]; + zc[0].u64[0] = yc[0].u64[0]; + zd[0].u64[0] = yd[0].u64[0]; + za[0].u64[3] = ya[1].u64[0]; + zb[0].u64[3] = yb[1].u64[0]; + zc[0].u64[3] = yc[1].u64[0]; + zd[0].u64[3] = yd[1].u64[0]; + za[0].u64[2] = ya[2].u64[0]; + zb[0].u64[2] = yb[2].u64[0]; + zc[0].u64[2] = yc[2].u64[0]; + zd[0].u64[2] = yd[2].u64[0]; + za[0].u64[1] = ya[3].u64[0]; + zb[0].u64[1] = yb[3].u64[0]; + zc[0].u64[1] = yc[3].u64[0]; + zd[0].u64[1] = yd[3].u64[0]; + + za[1].u64[1] = ya[0].u64[1]; + zb[1].u64[1] = yb[0].u64[1]; + zc[1].u64[1] = yc[0].u64[1]; + zd[1].u64[1] = yd[0].u64[1]; + za[1].u64[0] = ya[1].u64[1]; + zb[1].u64[0] = yb[1].u64[1]; + zc[1].u64[0] = yc[1].u64[1]; + zd[1].u64[0] = yd[1].u64[1]; + za[1].u64[3] = ya[2].u64[1]; + zb[1].u64[3] = yb[2].u64[1]; + zc[1].u64[3] = yc[2].u64[1]; + zd[1].u64[3] = yd[2].u64[1]; + za[1].u64[2] = ya[3].u64[1]; + zb[1].u64[2] = yb[3].u64[1]; + zc[1].u64[2] = yc[3].u64[1]; + zd[1].u64[2] = yd[3].u64[1]; + + za[2].u64[2] = ya[0].u64[2]; + zb[2].u64[2] = yb[0].u64[2]; + zc[2].u64[2] = yc[0].u64[2]; + zd[2].u64[2] = yd[0].u64[2]; + za[2].u64[1] = ya[1].u64[2]; + zb[2].u64[1] = yb[1].u64[2]; + zc[2].u64[1] = yc[1].u64[2]; + zd[2].u64[1] = yd[1].u64[2]; + za[2].u64[0] = ya[2].u64[2]; + zb[2].u64[0] = yb[2].u64[2]; + zc[2].u64[0] = yc[2].u64[2]; + zd[2].u64[0] = yd[2].u64[2]; + za[2].u64[3] = ya[3].u64[2]; + zb[2].u64[3] = yb[3].u64[2]; + zc[2].u64[3] = yc[3].u64[2]; + zd[2].u64[3] = yd[3].u64[2]; + + za[3].u64[3] = ya[0].u64[3]; + zb[3].u64[3] = yb[0].u64[3]; + zc[3].u64[3] = yc[0].u64[3]; + zd[3].u64[3] = yd[0].u64[3]; + za[3].u64[2] = ya[1].u64[3]; + zb[3].u64[2] = yb[1].u64[3]; + zc[3].u64[2] = yc[1].u64[3]; + zd[3].u64[2] = yd[1].u64[3]; + za[3].u64[1] = ya[2].u64[3]; + zb[3].u64[1] = yb[2].u64[3]; + zc[3].u64[1] = yc[2].u64[3]; + zd[3].u64[1] = yd[2].u64[3]; + za[3].u64[0] = ya[3].u64[3]; + zb[3].u64[0] = yb[3].u64[3]; + zc[3].u64[0] = yc[3].u64[3]; + zd[3].u64[0] = yd[3].u64[3]; + + ba[0] = _mm256_add_epi32( ba[0], za[0].m256 ); + bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 ); + bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 ); + bd[0] = _mm256_add_epi32( bd[0], zd[0].m256 ); + ba[1] = _mm256_add_epi32( ba[1], za[1].m256 ); + bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 ); + bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 ); + bd[1] = _mm256_add_epi32( bd[1], zd[1].m256 ); + ba[2] = _mm256_add_epi32( ba[2], za[2].m256 ); + bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 ); + bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 ); + bd[2] = _mm256_add_epi32( bd[2], zd[2].m256 ); + ba[3] = 
_mm256_add_epi32( ba[3], za[3].m256 ); + bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 ); + bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 ); + bd[3] = _mm256_add_epi32( bd[3], zd[3].m256 ); +} + +void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ) + +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + uint64_t *X2 = X+64; + uint64_t *X3 = X+96; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + uint64_t *V2 = V + 64*N; + uint64_t *V3 = V + 96*N; + + for ( int i = 0; i < N; i++ ) + { + memcpy( &V0[i * 32], X0, 2*128 ); + memcpy( &V1[i * 32], X1, 2*128 ); + memcpy( &V2[i * 32], X2, 2*128 ); + memcpy( &V3[i * 32], X3, 2*128 ); + salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + // need 4 J's + uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); + uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) ); + uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); + uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); + uint32_t j3l = 32 * ( (uint32_t)( X3[16] ) & ( N-1 ) ); + uint32_t j3h = 32 * ( (uint32_t)( X3[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) + | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); + X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) + | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) + | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); + X3[k] ^= ( ( V3[ j3h + k ] & 0xffffffff00000000 ) + | ( V3[ j3l + k ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } +} + + +#endif // AVX2 + +#if defined(__SSE2__) // required and assumed + +// Simple 4 way parallel. 
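+// Layout sketch (inferred from the u32[l] accesses in scrypt_core_4way
+// below): element l of each __m128i holds the current word of lane l, so
+//
+//    m128_ovly w = ( (m128_ovly*)X )[i];   // word i of all 4 lanes
+//    uint32_t lane2_word_i = w.u32[2];     // illustrative only
+//
+// and a single _mm_xor_si128 / _mm_add_epi32 advances 4 independent hashes.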
+// Tested OK
+// Scryptn2 a little slower than pooler
+// Scrypt 2x faster than pooler
+// 4x memory usage
+// 4x32 interleaving
+static void xor_salsa8_4way( __m128i * const B, const __m128i * const C )
+{
+   __m128i x0 = B[ 0] = _mm_xor_si128( B[ 0], C[ 0] );
+   __m128i x1 = B[ 1] = _mm_xor_si128( B[ 1], C[ 1] );
+   __m128i x2 = B[ 2] = _mm_xor_si128( B[ 2], C[ 2] );
+   __m128i x3 = B[ 3] = _mm_xor_si128( B[ 3], C[ 3] );
+   __m128i x4 = B[ 4] = _mm_xor_si128( B[ 4], C[ 4] );
+   __m128i x5 = B[ 5] = _mm_xor_si128( B[ 5], C[ 5] );
+   __m128i x6 = B[ 6] = _mm_xor_si128( B[ 6], C[ 6] );
+   __m128i x7 = B[ 7] = _mm_xor_si128( B[ 7], C[ 7] );
+   __m128i x8 = B[ 8] = _mm_xor_si128( B[ 8], C[ 8] );
+   __m128i x9 = B[ 9] = _mm_xor_si128( B[ 9], C[ 9] );
+   __m128i xa = B[10] = _mm_xor_si128( B[10], C[10] );
+   __m128i xb = B[11] = _mm_xor_si128( B[11], C[11] );
+   __m128i xc = B[12] = _mm_xor_si128( B[12], C[12] );
+   __m128i xd = B[13] = _mm_xor_si128( B[13], C[13] );
+   __m128i xe = B[14] = _mm_xor_si128( B[14], C[14] );
+   __m128i xf = B[15] = _mm_xor_si128( B[15], C[15] );
+
+   #define ROL32 mm128_rol_32
+   #define ADD32 _mm_add_epi32
+   #define XOR   _mm_xor_si128
+
+   SALSA_8ROUNDS;
+
+   #undef ROL32
+   #undef ADD32
+   #undef XOR
+
+   B[ 0] = _mm_add_epi32( B[ 0], x0 );
+   B[ 1] = _mm_add_epi32( B[ 1], x1 );
+   B[ 2] = _mm_add_epi32( B[ 2], x2 );
+   B[ 3] = _mm_add_epi32( B[ 3], x3 );
+   B[ 4] = _mm_add_epi32( B[ 4], x4 );
+   B[ 5] = _mm_add_epi32( B[ 5], x5 );
+   B[ 6] = _mm_add_epi32( B[ 6], x6 );
+   B[ 7] = _mm_add_epi32( B[ 7], x7 );
+   B[ 8] = _mm_add_epi32( B[ 8], x8 );
+   B[ 9] = _mm_add_epi32( B[ 9], x9 );
+   B[10] = _mm_add_epi32( B[10], xa );
+   B[11] = _mm_add_epi32( B[11], xb );
+   B[12] = _mm_add_epi32( B[12], xc );
+   B[13] = _mm_add_epi32( B[13], xd );
+   B[14] = _mm_add_epi32( B[14], xe );
+   B[15] = _mm_add_epi32( B[15], xf );
+}
+
+void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N )
+{
+   for ( int i = 0; i < N; i++ )
+   {
+      memcpy( &V[i * 32], X, 128*4 );
+      xor_salsa8_4way( &X[ 0], &X[16] );
+      xor_salsa8_4way( &X[16], &X[ 0] );
+   }
+   for ( int i = 0; i < N; i++ )
+   {
+      m128_ovly *vptr[4];
+      m128_ovly *x16 = (m128_ovly*)(&X[16]);
+
+      for ( int l = 0; l < 4; l++ )
+      {
+         uint32_t xl = (*x16).u32[l];
+         vptr[l] = (m128_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] );
+      }
+
+      for ( int k = 0; k < 32; k++ )
+      {
+         m128_ovly v;
+         for ( int l = 0; l < 4; l++ )
+            v.u32[l] = ( *(vptr[ l ] + k ) ).u32[l];
+         X[ k ] = _mm_xor_si128( X[ k ], v.m128 );
+      }
+
+      xor_salsa8_4way( &X[ 0], &X[16] );
+      xor_salsa8_4way( &X[16], &X[ 0] );
+   }
+}
+
+
+// Linear SIMD single thread. No memory increase but some shuffling overhead
+// required.
+
+// 4 way 32 bit interleaving within a single 32 bit thread: interleave while
+// loading, deinterleave while storing, with the 2 way 128 & 4 way 128
+// parallel variants layered on top.
+//
+// SALSA_2ROUNDS( {x0,x5,xa,xf}, {x4,x9,xe,x3}, {x8,xd,x2,x7}, {xc,x1,x6,xb})
+
+// Tested OK.
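+// For reference, the round macros evaluate the usual scalar Salsa20 quarter
+// rounds, four at a time (one per 32 bit lane). The first column quarter
+// round, as a sketch only, with ROTL32 standing in for the vector ROL32
+// defined below:
+//
+//    x4 ^= ROTL32( x0 + xc,  7 );
+//    x8 ^= ROTL32( x4 + x0,  9 );
+//    xc ^= ROTL32( x8 + x4, 13 );
+//    x0 ^= ROTL32( xc + x8, 18 );
+//
+// Each SALSA_2ROUNDS pass is one such column round followed by a row round
+// over the shuffled word groups listed above.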
+// No interleaving +static void salsa8_simd128( uint32_t *b, const uint32_t * const c) +{ + __m128i X0, X1, X2, X3; + __m128i *B = (__m128i*)b; + const __m128i *C = (const __m128i*)c; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + + // mix C into B then shuffle B into X + B[0] = _mm_xor_si128( B[0], C[0] ); + B[1] = _mm_xor_si128( B[1], C[1] ); + B[2] = _mm_xor_si128( B[2], C[2] ); + B[3] = _mm_xor_si128( B[3], C[3] ); + +#if defined(__SSE4_1__) + + __m128i Y0, Y1, Y2, Y3; + +#if defined(__AVX2__) + + Y0 = _mm_blend_epi32( B[1], B[0], 0x1 ); + X0 = _mm_blend_epi32( B[3], B[2], 0x4 ); + Y1 = _mm_blend_epi32( B[2], B[1], 0x1 ); + X1 = _mm_blend_epi32( B[0], B[3], 0x4 ); + Y2 = _mm_blend_epi32( B[3], B[2], 0x1 ); + X2 = _mm_blend_epi32( B[1], B[0], 0x4 ); + Y3 = _mm_blend_epi32( B[0], B[3], 0x1 ); + X3 = _mm_blend_epi32( B[2], B[1], 0x4 ); + X0 = _mm_blend_epi32( X0, Y0, 0x3); + X1 = _mm_blend_epi32( X1, Y1, 0x3 ); + X2 = _mm_blend_epi32( X2, Y2, 0x3 ); + X3 = _mm_blend_epi32( X3, Y3, 0x3 ); + +#else // SSE4_1 + + Y0 = _mm_blend_epi16( B[1], B[0], 0x03 ); + X0 = _mm_blend_epi16( B[3], B[2], 0x30 ); + Y1 = _mm_blend_epi16( B[2], B[1], 0x03 ); + X1 = _mm_blend_epi16( B[0], B[3], 0x30 ); + Y2 = _mm_blend_epi16( B[3], B[2], 0x03 ); + X2 = _mm_blend_epi16( B[1], B[0], 0x30 ); + Y3 = _mm_blend_epi16( B[0], B[3], 0x03 ); + X3 = _mm_blend_epi16( B[2], B[1], 0x30 ); + + X0 = _mm_blend_epi16( X0, Y0, 0x0f ); + X1 = _mm_blend_epi16( X1, Y1, 0x0f ); + X2 = _mm_blend_epi16( X2, Y2, 0x0f ); + X3 = _mm_blend_epi16( X3, Y3, 0x0f ); + +#endif // AVX2 else SSE4_1 + + SALSA_8ROUNDS_SIMD128; + +#if defined(__AVX2__) + + Y0 = _mm_blend_epi32( X0, X1, 0x8 ); + Y1 = _mm_blend_epi32( X0, X1, 0x1 ); + Y2 = _mm_blend_epi32( X0, X1, 0x2 ); + Y3 = _mm_blend_epi32( X0, X1, 0x4 ); + + Y0 = _mm_blend_epi32( Y0, X2, 0x4 ); + Y1 = _mm_blend_epi32( Y1, X2, 0x8 ); + Y2 = _mm_blend_epi32( Y2, X2, 0x1 ); + Y3 = _mm_blend_epi32( Y3, X2, 0x2 ); + + Y0 = _mm_blend_epi32( Y0, X3, 0x2 ); + Y1 = _mm_blend_epi32( Y1, X3, 0x4 ); + Y2 = _mm_blend_epi32( Y2, X3, 0x8 ); + Y3 = _mm_blend_epi32( Y3, X3, 0x1 ); + +#else // SSE4_1 + + Y0 = _mm_blend_epi16( X0, X1, 0xc0 ); + Y1 = _mm_blend_epi16( X0, X1, 0x03 ); + Y2 = _mm_blend_epi16( X0, X1, 0x0c ); + Y3 = _mm_blend_epi16( X0, X1, 0x30 ); + + Y0 = _mm_blend_epi16( Y0, X2, 0x30 ); + Y1 = _mm_blend_epi16( Y1, X2, 0xc0 ); + Y2 = _mm_blend_epi16( Y2, X2, 0x03 ); + Y3 = _mm_blend_epi16( Y3, X2, 0x0c ); + + Y0 = _mm_blend_epi16( Y0, X3, 0x0c ); + Y1 = _mm_blend_epi16( Y1, X3, 0x30 ); + Y2 = _mm_blend_epi16( Y2, X3, 0xc0 ); + Y3 = _mm_blend_epi16( Y3, X3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + B[0] = _mm_add_epi32( B[0], Y0 ); + B[1] = _mm_add_epi32( B[1], Y1 ); + B[2] = _mm_add_epi32( B[2], Y2 ); + B[3] = _mm_add_epi32( B[3], Y3 ); + +#else // SSE2 + + m128_ovly y[4], z[4]; + + X0 = _mm_set_epi32( b[15], b[10], b[ 5], b[ 0] ); + X1 = _mm_set_epi32( b[ 3], b[14], b[ 9], b[ 4] ); + X2 = _mm_set_epi32( b[ 7], b[ 2], b[13], b[ 8] ); + X3 = _mm_set_epi32( b[11], b[ 6], b[ 1], b[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128; + + // Final round doesn't shuffle data back to original input order, + // process it as is. 
+ // X0 is unchanged { xf, xa, x5, x0 } + // X1 is shuffled left 1 (rol_1x32) { xe, x9, x4, x3 } + // X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 } + // X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 } + + y[0].m128 = X0; + y[1].m128 = X1; + y[2].m128 = X2; + y[3].m128 = X3; + + z[0].u32[0] = y[0].u32[0]; + z[0].u32[3] = y[1].u32[0]; + z[0].u32[2] = y[2].u32[0]; + z[0].u32[1] = y[3].u32[0]; + + z[1].u32[1] = y[0].u32[1]; + z[1].u32[0] = y[1].u32[1]; + z[1].u32[3] = y[2].u32[1]; + z[1].u32[2] = y[3].u32[1]; + + z[2].u32[2] = y[0].u32[2]; + z[2].u32[1] = y[1].u32[2]; + z[2].u32[0] = y[2].u32[2]; + z[2].u32[3] = y[3].u32[2]; + + z[3].u32[3] = y[0].u32[3]; + z[3].u32[2] = y[1].u32[3]; + z[3].u32[1] = y[2].u32[3]; + z[3].u32[0] = y[3].u32[3]; + + B[0] = _mm_add_epi32( B[0], z[0].m128 ); + B[1] = _mm_add_epi32( B[1], z[1].m128 ); + B[2] = _mm_add_epi32( B[2], z[2].m128 ); + B[3] = _mm_add_epi32( B[3], z[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + +} + +void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + _mm_stream_si128( (__m128i*)V + i*8 + k, casti_m128i( X, k ) ); + + salsa8_simd128( &X[ 0], &X[16] ); + salsa8_simd128( &X[16], &X[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + const int j = 32 * ( X[16] & ( N - 1 ) ); + for ( int k = 0; k < 32; k++ ) + X[k] ^= V[j + k]; + salsa8_simd128( &X[ 0], &X[16] ); + salsa8_simd128( &X[16], &X[ 0] ); + } +} + +// Double buffered, 2x memory usage +// No interleaving +static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, + const uint32_t * const ca, const uint32_t * const cb ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + // mix C into B then shuffle B into X + BA[0] = _mm_xor_si128( BA[0], CA[0] ); + BB[0] = _mm_xor_si128( BB[0], CB[0] ); + BA[1] = _mm_xor_si128( BA[1], CA[1] ); + BB[1] = _mm_xor_si128( BB[1], CB[1] ); + BA[2] = _mm_xor_si128( BA[2], CA[2] ); + BB[2] = _mm_xor_si128( BB[2], CB[2] ); + BA[3] = _mm_xor_si128( BA[3], CA[3] ); + BB[3] = _mm_xor_si128( BB[3], CB[3] ); + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); + YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); + XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); + XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); + + YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); + YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); + XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); + XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); + + YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); + YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); + XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); + XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); + + YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); + YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); + XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); + XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); + + XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); + XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); + + XA1 = _mm_blend_epi32( XA1, YA1, 0x3 
); + XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + + XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); + XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + + XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); + XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); + YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); + XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); + XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); + + YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); + YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); + XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); + XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + + YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); + YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); + XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); + XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + + YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); + YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); + XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); + XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + + XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); + XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + + XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); + XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + + XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); + XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + + XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); + XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + +#endif // AVX2 else SSE4_1 + + SALSA_8ROUNDS_SIMD128_2BUF; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); + YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); + YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); + YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); + YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); + YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); + YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); + YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); + + YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); + YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); + YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); + YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); + YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); + YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); + YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); + YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); + YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); + YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); + YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); + YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); + YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); + YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); + YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); + + YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); + YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); + YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); + YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); + YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); + YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); + YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); + YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + BA[0] = 
_mm_add_epi32( BA[0], YA0 ); + BB[0] = _mm_add_epi32( BB[0], YB0 ); + BA[1] = _mm_add_epi32( BA[1], YA1 ); + BB[1] = _mm_add_epi32( BB[1], YB1 ); + BA[2] = _mm_add_epi32( BA[2], YA2 ); + BB[2] = _mm_add_epi32( BB[2], YB2 ); + BA[3] = _mm_add_epi32( BA[3], YA3 ); + BB[3] = _mm_add_epi32( BB[3], YB3 ); + +#else // SSE2 + + m128_ovly ya[4], za[4], yb[4], zb[4]; + + XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); + XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); + XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); + XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); + XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); + XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); + XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); + XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128_2BUF; + + // Final round doesn't shuffle data back to original input order, + // process it as is. + + ya[0].m128 = XA0; + yb[0].m128 = XB0; + ya[1].m128 = XA1; + yb[1].m128 = XB1; + ya[2].m128 = XA2; + yb[2].m128 = XB2; + ya[3].m128 = XA3; + yb[3].m128 = XB3; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + za[0].u32[3] = ya[1].u32[0]; + zb[0].u32[3] = yb[1].u32[0]; + za[0].u32[2] = ya[2].u32[0]; + zb[0].u32[2] = yb[2].u32[0]; + za[0].u32[1] = ya[3].u32[0]; + zb[0].u32[1] = yb[3].u32[0]; + + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + za[1].u32[0] = ya[1].u32[1]; + zb[1].u32[0] = yb[1].u32[1]; + za[1].u32[3] = ya[2].u32[1]; + zb[1].u32[3] = yb[2].u32[1]; + za[1].u32[2] = ya[3].u32[1]; + zb[1].u32[2] = yb[3].u32[1]; + + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + za[2].u32[1] = ya[1].u32[2]; + zb[2].u32[1] = yb[1].u32[2]; + za[2].u32[0] = ya[2].u32[2]; + zb[2].u32[0] = yb[2].u32[2]; + za[2].u32[3] = ya[3].u32[2]; + zb[2].u32[3] = yb[3].u32[2]; + + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + za[3].u32[2] = ya[1].u32[3]; + zb[3].u32[2] = yb[1].u32[3]; + za[3].u32[1] = ya[2].u32[3]; + zb[3].u32[1] = yb[2].u32[3]; + za[3].u32[0] = ya[3].u32[3]; + zb[3].u32[0] = yb[3].u32[3]; + + BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); + BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); + BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); + BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); + BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); + BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); + BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); + BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + + +// X: 2 sequential buffers +// V: 2 sequential buffers interleaved by the size of N +// interleaved buffers { v00, v01, v10, v11, v20... 
} +// +void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + for ( int k = 0; k < 4; k++ ) + { + _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); + _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); + } + + #else + + memcpy( &V0[ i*32 ], X0, 128 ); + memcpy( &V1[ i*32 ], X1, 128 ); + + #endif + + salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N-1 ) ); + const int j1 = 4 * ( X1[16] & ( N-1 ) ); + for ( int k = 0; k < 4; k++ ) + { + const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); + const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); +// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); +// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); + casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); + casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); + } + + #else + + const int j0 = 8 * ( X0[16] & ( N-1 ) ); + const int j1 = 8 * ( X1[16] & ( N-1 ) ); + for ( int k = 0; k < 8; k++ ) + { + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); + casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); + casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + } + + #endif + +/* + const int j0 = 16 * ( X0[16] & ( N - 1 ) ); + const int j1 = 16 * ( X1[16] & ( N - 1 ) ); + + for ( int k = 0; k < 16; k++ ) + { + const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; + const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; + ( (uint64_t*)X0 )[k] ^= v0; + ( (uint64_t*)X1 )[k] ^= v1; + } +*/ + +/* + const int j0 = 32 * ( X0[16] & ( N-1 ) ); + const int j1 = 32 * ( X1[16] & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + const uint32_t v0 = V0[ j0+k ]; + const uint32_t v1 = V1[ j1+k ]; + X0[k] ^= v0; + X1[k] ^= v1; + } +*/ + + salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } +} + + +// Triple buffered, 3x memory usage +// No interleaving +static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, + const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + __m128i *BC = (__m128i*)bc; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + const __m128i *CC = (const __m128i*)cc; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + // mix C into B then shuffle B into X + BA[0] = _mm_xor_si128( BA[0], CA[0] ); + BB[0] = _mm_xor_si128( BB[0], CB[0] ); + BC[0] = _mm_xor_si128( BC[0], CC[0] ); + BA[1] = _mm_xor_si128( BA[1], CA[1] ); + BB[1] = _mm_xor_si128( BB[1], CB[1] ); + BC[1] = _mm_xor_si128( BC[1], CC[1] ); + BA[2] = _mm_xor_si128( BA[2], CA[2] ); + BB[2] = _mm_xor_si128( BB[2], CB[2] ); + BC[2] = _mm_xor_si128( BC[2], CC[2] ); + BA[3] = _mm_xor_si128( BA[3], 
CA[3] ); + BB[3] = _mm_xor_si128( BB[3], CB[3] ); + BC[3] = _mm_xor_si128( BC[3], CC[3] ); + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); + YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); + YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); + XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); + XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); + XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); + + YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); + YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); + YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); + XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); + XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); + XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); + + YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); + YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); + YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); + XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); + XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); + XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); + + YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); + YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); + YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); + XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); + XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); + XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); + + XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); + XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); + XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); + + XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); + XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); + + XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); + XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); + + XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); + XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); + YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); + YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); + XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); + XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); + XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); + XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); + XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); + + YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); + YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); + YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); + XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); + XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); + XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); + XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); + + YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); + YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); + YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); + XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); + XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); + XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); + XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); + + YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); + YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); + YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); + XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); + XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); + XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); + XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); + +#endif // AVX2 else SSE3_1 + + 
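+   // Run the 8 Salsa rounds with the 3 independent buffers interleaved at
+   // instruction level, presumably to hide latency by giving the CPU more
+   // independent work per iteration.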
SALSA_8ROUNDS_SIMD128_3BUF; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); + YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); + YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); + YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); + YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); + YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); + YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); + YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); + YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); + YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); + YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); + YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); + YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); + YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); + YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); + YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); + + YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); + YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); + YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); + YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); + YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); + YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); + YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); + YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); + YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); + YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); + YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); + YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); + YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); + YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); + YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); + YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); + YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); + YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); + YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); + YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); + YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); + YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); + YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); + YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); + YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); + YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); + + YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); + YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); + YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); + YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); + YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); + YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); + YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); + YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); + YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); + YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); + YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + BA[0] = _mm_add_epi32( BA[0], YA0 ); + BB[0] = _mm_add_epi32( BB[0], YB0 ); + BC[0] = _mm_add_epi32( BC[0], YC0 ); + BA[1] = _mm_add_epi32( BA[1], YA1 ); + BB[1] = _mm_add_epi32( BB[1], YB1 ); + BC[1] = _mm_add_epi32( BC[1], YC1 ); + BA[2] = _mm_add_epi32( BA[2], YA2 ); + BB[2] = _mm_add_epi32( BB[2], YB2 ); + BC[2] = _mm_add_epi32( BC[2], YC2 ); + BA[3] = _mm_add_epi32( 
BA[3], YA3 ); + BB[3] = _mm_add_epi32( BB[3], YB3 ); + BC[3] = _mm_add_epi32( BC[3], YC3 ); + +#else // SSE2 + + m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4]; + + XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); + XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); + XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); + XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); + XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); + XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); + XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); + XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); + XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); + XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); + XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); + XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128_3BUF; + + // Final round doesn't shuffle data back to original input order, + // process it as is. + + ya[0].m128 = XA0; + yb[0].m128 = XB0; + yc[0].m128 = XC0; + ya[1].m128 = XA1; + yb[1].m128 = XB1; + yc[1].m128 = XC1; + ya[2].m128 = XA2; + yb[2].m128 = XB2; + yc[2].m128 = XC2; + ya[3].m128 = XA3; + yb[3].m128 = XB3; + yc[3].m128 = XC3; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + zc[0].u32[0] = yc[0].u32[0]; + za[0].u32[3] = ya[1].u32[0]; + zb[0].u32[3] = yb[1].u32[0]; + zc[0].u32[3] = yc[1].u32[0]; + za[0].u32[2] = ya[2].u32[0]; + zb[0].u32[2] = yb[2].u32[0]; + zc[0].u32[2] = yc[2].u32[0]; + za[0].u32[1] = ya[3].u32[0]; + zb[0].u32[1] = yb[3].u32[0]; + zc[0].u32[1] = yc[3].u32[0]; + + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + zc[1].u32[1] = yc[0].u32[1]; + za[1].u32[0] = ya[1].u32[1]; + zb[1].u32[0] = yb[1].u32[1]; + zc[1].u32[0] = yc[1].u32[1]; + za[1].u32[3] = ya[2].u32[1]; + zb[1].u32[3] = yb[2].u32[1]; + zc[1].u32[3] = yc[2].u32[1]; + za[1].u32[2] = ya[3].u32[1]; + zb[1].u32[2] = yb[3].u32[1]; + zc[1].u32[2] = yc[3].u32[1]; + + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + zc[2].u32[2] = yc[0].u32[2]; + za[2].u32[1] = ya[1].u32[2]; + zb[2].u32[1] = yb[1].u32[2]; + zc[2].u32[1] = yc[1].u32[2]; + za[2].u32[0] = ya[2].u32[2]; + zb[2].u32[0] = yb[2].u32[2]; + zc[2].u32[0] = yc[2].u32[2]; + za[2].u32[3] = ya[3].u32[2]; + zb[2].u32[3] = yb[3].u32[2]; + zc[2].u32[3] = yc[3].u32[2]; + + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + zc[3].u32[3] = yc[0].u32[3]; + za[3].u32[2] = ya[1].u32[3]; + zb[3].u32[2] = yb[1].u32[3]; + zc[3].u32[2] = yc[1].u32[3]; + za[3].u32[1] = ya[2].u32[3]; + zb[3].u32[1] = yb[2].u32[3]; + zc[3].u32[1] = yc[2].u32[3]; + za[3].u32[0] = ya[3].u32[3]; + zb[3].u32[0] = yb[3].u32[3]; + zc[3].u32[0] = yc[3].u32[3]; + + BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); + BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); + BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); + BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); + BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); + BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); + BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); + BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); + BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); + BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); + BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *X2 = X+64; + 
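+   // X is 3 consecutive 32-word (128 byte) states; V is carved into 3
+   // contiguous scratchpads of 32*N words, one per buffer.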
uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + uint32_t *V2 = V + 64*N; + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + for ( int k = 0; k < 4; k++ ) + { + _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); + _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); + _mm256_stream_si256( (__m256i*)V2 + i*4 + k, casti_m256i( X2, k ) ); + } + + #else + + memcpy( &V0[ i*32 ], X0, 128 ); + memcpy( &V1[ i*32 ], X1, 128 ); + memcpy( &V2[ i*32 ], X2, 128 ); + + #endif + + salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N - 1 ) ); + const int j1 = 4 * ( X1[16] & ( N - 1 ) ); + const int j2 = 4 * ( X2[16] & ( N - 1 ) ); + + for ( int k = 0; k < 4; k++ ) + { + const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); + const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); + const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); +// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); +// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); +// const __m256i v2 = _mm256_load_si256( ( (__m256i*)V2 ) +j2+k ); + casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); + casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); + casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); + } + + #else + + const int j0 = 8 * ( X0[16] & ( N - 1 ) ); + const int j1 = 8 * ( X1[16] & ( N - 1 ) ); + const int j2 = 8 * ( X2[16] & ( N - 1 ) ); + for ( int k = 0; k < 8; k++ ) + { + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); + const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); + casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); + casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); + } + + #endif + +/* + const int j0 = 16 * ( X0[16] & ( N - 1 ) ); + const int j1 = 16 * ( X1[16] & ( N - 1 ) ); + const int j2 = 16 * ( X2[16] & ( N - 1 ) ); + + for ( int k = 0; k < 16; k++ ) + { + const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; + const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; + const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; + ( (uint64_t*)X0 )[k] ^= v0; + ( (uint64_t*)X1 )[k] ^= v1; + ( (uint64_t*)X2 )[k] ^= v2; + } +*/ + +/* + const int j0 = 32 * ( X0[16] & ( N - 1 ) ); + const int j1 = 32 * ( X1[16] & ( N - 1 ) ); + const int j2 = 32 * ( X2[16] & ( N - 1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + const uint32_t v0 = V0[ j0+k ]; + const uint32_t v1 = V1[ j1+k ]; + const uint32_t v2 = V2[ j2+k ]; + X0[k] ^= v0; + X1[k] ^= v1; + X2[k] ^= v2; + } +*/ + + salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } +} + +// Working. 
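+// The quadruple buffered pair below has the same structure as the double and
+// triple buffered variants above: four independent 128 byte lanes go through
+// the double salsa8 block mix per call, and scrypt_core_simd128_4buf splits V
+// into four private 32*N word scratchpads, hence the 4x memory usage.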
+// Quadruple buffered, 4x memory usage +// No interleaving +static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, + uint32_t *bd, const uint32_t *ca, const uint32_t *cb, + const uint32_t *cc, const uint32_t *cd ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + __m128i *BC = (__m128i*)bc; + __m128i *BD = (__m128i*)bd; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + const __m128i *CC = (const __m128i*)cc; + const __m128i *CD = (const __m128i*)cd; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + // mix C into B then shuffle B into X + BA[0] = _mm_xor_si128( BA[0], CA[0] ); + BB[0] = _mm_xor_si128( BB[0], CB[0] ); + BC[0] = _mm_xor_si128( BC[0], CC[0] ); + BD[0] = _mm_xor_si128( BD[0], CD[0] ); + BA[1] = _mm_xor_si128( BA[1], CA[1] ); + BB[1] = _mm_xor_si128( BB[1], CB[1] ); + BC[1] = _mm_xor_si128( BC[1], CC[1] ); + BD[1] = _mm_xor_si128( BD[1], CD[1] ); + BA[2] = _mm_xor_si128( BA[2], CA[2] ); + BB[2] = _mm_xor_si128( BB[2], CB[2] ); + BC[2] = _mm_xor_si128( BC[2], CC[2] ); + BD[2] = _mm_xor_si128( BD[2], CD[2] ); + BA[3] = _mm_xor_si128( BA[3], CA[3] ); + BB[3] = _mm_xor_si128( BB[3], CB[3] ); + BC[3] = _mm_xor_si128( BC[3], CC[3] ); + BD[3] = _mm_xor_si128( BD[3], CD[3] ); + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, + YC0, YC1, YC2, YC3, YD0, YD1, YD2, YD3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); + YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); + YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); + YD0 = _mm_blend_epi32( BD[1], BD[0], 0x1 ); + XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); + XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); + XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); + XD0 = _mm_blend_epi32( BD[3], BD[2], 0x4 ); + + YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); + YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); + YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); + YD1 = _mm_blend_epi32( BD[2], BD[1], 0x1 ); + XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); + XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); + XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); + XD1 = _mm_blend_epi32( BD[0], BD[3], 0x4 ); + + YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); + YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); + YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); + YD2 = _mm_blend_epi32( BD[3], BD[2], 0x1 ); + XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); + XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); + XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); + XD2 = _mm_blend_epi32( BD[1], BD[0], 0x4 ); + + YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); + YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); + YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); + YD3 = _mm_blend_epi32( BD[0], BD[3], 0x1 ); + XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); + XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); + XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); + XD3 = _mm_blend_epi32( BD[2], BD[1], 0x4 ); + + XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); + XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); + XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); + XD0 = _mm_blend_epi32( XD0, YD0, 0x3 ); + + XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); + XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); + XD1 = _mm_blend_epi32( XD1, YD1, 
0x3 ); + + XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); + XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); + XD2 = _mm_blend_epi32( XD2, YD2, 0x3 ); + + XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); + XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); + XD3 = _mm_blend_epi32( XD3, YD3, 0x3 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); + YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); + YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); + YD0 = _mm_blend_epi16( BD[1], BD[0], 0x03 ); + XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); + XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); + XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); + XD0 = _mm_blend_epi16( BD[3], BD[2], 0x30 ); + XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); + XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); + XD0 = _mm_blend_epi16( XD0, YD0, 0x0f ); + + YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); + YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); + YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); + YD1 = _mm_blend_epi16( BD[2], BD[1], 0x03 ); + XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); + XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); + XD1 = _mm_blend_epi16( BD[0], BD[3], 0x30 ); + XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); + XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); + XD1 = _mm_blend_epi16( XD1, YD1, 0x0f ); + + YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); + YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); + YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); + YD2 = _mm_blend_epi16( BD[3], BD[2], 0x03 ); + XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); + XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); + XD2 = _mm_blend_epi16( BD[1], BD[0], 0x30 ); + XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); + XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); + XD2 = _mm_blend_epi16( XD2, YD2, 0x0f ); + + YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); + YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); + YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); + YD3 = _mm_blend_epi16( BD[0], BD[3], 0x03 ); + XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); + XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); + XD3 = _mm_blend_epi16( BD[2], BD[1], 0x30 ); + XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); + XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); + XD3 = _mm_blend_epi16( XD3, YD3, 0x0f ); + +#endif // AVX2 else SSE3_1 + + SALSA_8ROUNDS_SIMD128_4BUF; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); + YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); + YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); + YD0 = _mm_blend_epi32( XD0, XD1, 0x8 ); + YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); + YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); + YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); + YD1 = _mm_blend_epi32( XD0, XD1, 0x1 ); + YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); + YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); + YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); + YD2 = _mm_blend_epi32( XD0, XD1, 0x2 ); + YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); + YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); + YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); + YD3 = _mm_blend_epi32( XD0, XD1, 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); + YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); + YD0 = _mm_blend_epi32( YD0, XD2, 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); + YB1 = 
_mm_blend_epi32( YB1, XB2, 0x8 ); + YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); + YD1 = _mm_blend_epi32( YD1, XD2, 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); + YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); + YD2 = _mm_blend_epi32( YD2, XD2, 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); + YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); + YD3 = _mm_blend_epi32( YD3, XD2, 0x2 ); + + YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); + YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); + YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); + YD0 = _mm_blend_epi32( YD0, XD3, 0x2 ); + YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); + YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); + YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); + YD1 = _mm_blend_epi32( YD1, XD3, 0x4 ); + YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); + YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); + YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); + YD2 = _mm_blend_epi32( YD2, XD3, 0x8 ); + YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); + YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); + YD3 = _mm_blend_epi32( YD3, XD3, 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); + YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); + YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); + YD0 = _mm_blend_epi16( XD0, XD1, 0xc0 ); + YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); + YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); + YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); + YD1 = _mm_blend_epi16( XD0, XD1, 0x03 ); + YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); + YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); + YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); + YD2 = _mm_blend_epi16( XD0, XD1, 0x0c ); + YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); + YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); + YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); + YD3 = _mm_blend_epi16( XD0, XD1, 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); + YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); + YD0 = _mm_blend_epi16( YD0, XD2, 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); + YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); + YD1 = _mm_blend_epi16( YD1, XD2, 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); + YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); + YD2 = _mm_blend_epi16( YD2, XD2, 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); + YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); + YD3 = _mm_blend_epi16( YD3, XD2, 0x0c ); + + YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); + YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); + YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); + YD0 = _mm_blend_epi16( YD0, XD3, 0x0c ); + YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); + YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); + YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); + YD1 = _mm_blend_epi16( YD1, XD3, 0x30 ); + YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); + YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); + YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); + YD2 = _mm_blend_epi16( YD2, XD3, 0xc0 ); + YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); + YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); + YD3 = _mm_blend_epi16( YD3, XD3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + BA[0] = _mm_add_epi32( BA[0], YA0 ); + BB[0] = _mm_add_epi32( BB[0], YB0 ); + BC[0] = _mm_add_epi32( BC[0], YC0 ); + BD[0] = _mm_add_epi32( BD[0], YD0 ); + BA[1] = _mm_add_epi32( BA[1], YA1 ); + BB[1] = _mm_add_epi32( BB[1], YB1 ); + BC[1] = _mm_add_epi32( BC[1], YC1 ); + BD[1] = _mm_add_epi32( BD[1], 
YD1 ); + BA[2] = _mm_add_epi32( BA[2], YA2 ); + BB[2] = _mm_add_epi32( BB[2], YB2 ); + BC[2] = _mm_add_epi32( BC[2], YC2 ); + BD[2] = _mm_add_epi32( BD[2], YD2 ); + BA[3] = _mm_add_epi32( BA[3], YA3 ); + BB[3] = _mm_add_epi32( BB[3], YB3 ); + BC[3] = _mm_add_epi32( BC[3], YC3 ); + BD[3] = _mm_add_epi32( BD[3], YD3 ); + +#else // SSE2 + + m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4], yd[4], zd[4]; + + XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); + XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); + XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); + XD0 = _mm_set_epi32( bd[15], bd[10], bd[ 5], bd[ 0] ); + XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); + XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); + XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); + XD1 = _mm_set_epi32( bd[ 3], bd[14], bd[ 9], bd[ 4] ); + XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); + XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); + XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); + XD2 = _mm_set_epi32( bd[ 7], bd[ 2], bd[13], bd[ 8] ); + XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); + XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); + XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); + XD3 = _mm_set_epi32( bd[11], bd[ 6], bd[ 1], bd[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128_4BUF; + + ya[0].m128 = XA0; + yb[0].m128 = XB0; + yc[0].m128 = XC0; + yd[0].m128 = XD0; + ya[1].m128 = XA1; + yb[1].m128 = XB1; + yc[1].m128 = XC1; + yd[1].m128 = XD1; + ya[2].m128 = XA2; + yb[2].m128 = XB2; + yc[2].m128 = XC2; + yd[2].m128 = XD2; + ya[3].m128 = XA3; + yb[3].m128 = XB3; + yc[3].m128 = XC3; + yd[3].m128 = XD3; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + zc[0].u32[0] = yc[0].u32[0]; + zd[0].u32[0] = yd[0].u32[0]; + za[0].u32[3] = ya[1].u32[0]; + zb[0].u32[3] = yb[1].u32[0]; + zc[0].u32[3] = yc[1].u32[0]; + zd[0].u32[3] = yd[1].u32[0]; + za[0].u32[2] = ya[2].u32[0]; + zb[0].u32[2] = yb[2].u32[0]; + zc[0].u32[2] = yc[2].u32[0]; + zd[0].u32[2] = yd[2].u32[0]; + za[0].u32[1] = ya[3].u32[0]; + zb[0].u32[1] = yb[3].u32[0]; + zc[0].u32[1] = yc[3].u32[0]; + zd[0].u32[1] = yd[3].u32[0]; + + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + zc[1].u32[1] = yc[0].u32[1]; + zd[1].u32[1] = yd[0].u32[1]; + za[1].u32[0] = ya[1].u32[1]; + zb[1].u32[0] = yb[1].u32[1]; + zc[1].u32[0] = yc[1].u32[1]; + zd[1].u32[0] = yd[1].u32[1]; + za[1].u32[3] = ya[2].u32[1]; + zb[1].u32[3] = yb[2].u32[1]; + zc[1].u32[3] = yc[2].u32[1]; + zd[1].u32[3] = yd[2].u32[1]; + za[1].u32[2] = ya[3].u32[1]; + zb[1].u32[2] = yb[3].u32[1]; + zc[1].u32[2] = yc[3].u32[1]; + zd[1].u32[2] = yd[3].u32[1]; + + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + zc[2].u32[2] = yc[0].u32[2]; + zd[2].u32[2] = yd[0].u32[2]; + za[2].u32[1] = ya[1].u32[2]; + zb[2].u32[1] = yb[1].u32[2]; + zc[2].u32[1] = yc[1].u32[2]; + zd[2].u32[1] = yd[1].u32[2]; + za[2].u32[0] = ya[2].u32[2]; + zb[2].u32[0] = yb[2].u32[2]; + zc[2].u32[0] = yc[2].u32[2]; + zd[2].u32[0] = yd[2].u32[2]; + za[2].u32[3] = ya[3].u32[2]; + zb[2].u32[3] = yb[3].u32[2]; + zc[2].u32[3] = yc[3].u32[2]; + zd[2].u32[3] = yd[3].u32[2]; + + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + zc[3].u32[3] = yc[0].u32[3]; + zd[3].u32[3] = yd[0].u32[3]; + za[3].u32[2] = ya[1].u32[3]; + zb[3].u32[2] = yb[1].u32[3]; + zc[3].u32[2] = yc[1].u32[3]; + zd[3].u32[2] = yd[1].u32[3]; + za[3].u32[1] = ya[2].u32[3]; + zb[3].u32[1] = yb[2].u32[3]; + zc[3].u32[1] = yc[2].u32[3]; + zd[3].u32[1] = yd[2].u32[3]; + 
za[3].u32[0] = ya[3].u32[3]; + zb[3].u32[0] = yb[3].u32[3]; + zc[3].u32[0] = yc[3].u32[3]; + zd[3].u32[0] = yd[3].u32[3]; + + BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); + BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); + BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); + BD[0] = _mm_add_epi32( BD[0], zd[0].m128 ); + BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); + BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); + BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); + BD[1] = _mm_add_epi32( BD[1], zd[1].m128 ); + BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); + BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); + BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); + BD[2] = _mm_add_epi32( BD[2], zd[2].m128 ); + BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); + BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); + BD[3] = _mm_add_epi32( BD[3], zd[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + +void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *X2 = X+64; + uint32_t *X3 = X+96; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + uint32_t *V2 = V + 64*N; + uint32_t *V3 = V + 96*N; + + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + { + _mm_stream_si128( (__m128i*)V0 + i*8 + k, casti_m128i( X0, k ) ); + _mm_stream_si128( (__m128i*)V1 + i*8 + k, casti_m128i( X1, k ) ); + _mm_stream_si128( (__m128i*)V2 + i*8 + k, casti_m128i( X2, k ) ); + _mm_stream_si128( (__m128i*)V3 + i*8 + k, casti_m128i( X3, k ) ); + } + + salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N - 1 ) ); + const int j1 = 4 * ( X1[16] & ( N - 1 ) ); + const int j2 = 4 * ( X2[16] & ( N - 1 ) ); + const int j3 = 4 * ( X3[16] & ( N - 1 ) ); + + for ( int k = 0; k < 4; k++ ) + { + const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); + const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); + const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); + const __m256i v3 = _mm256_stream_load_si256( ( (__m256i*)V3 ) +j3+k ); + casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); + casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); + casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); + casti_m256i( X3, k ) = _mm256_xor_si256( casti_m256i( X3, k ), v3 ); + } + + #else + + const int j0 = 8 * ( X0[16] & ( N - 1 ) ); + const int j1 = 8 * ( X1[16] & ( N - 1 ) ); + const int j2 = 8 * ( X2[16] & ( N - 1 ) ); + const int j3 = 8 * ( X3[16] & ( N - 1 ) ); + + for ( int k = 0; k < 8; k++ ) + { + #if defined(__SSE4_1__) + const __m128i v0 = _mm_stream_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_stream_load_si128( ( (__m128i*)V1 ) +j1+k ); + const __m128i v2 = _mm_stream_load_si128( ( (__m128i*)V2 ) +j2+k ); + const __m128i v3 = _mm_stream_load_si128( ( (__m128i*)V3 ) +j3+k ); + #else + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); + const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); + const __m128i v3 = _mm_load_si128( ( (__m128i*)V3 ) +j3+k ); + #endif + casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); + 
casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); + casti_m128i( X3, k ) = _mm_xor_si128( casti_m128i( X3, k ), v3 ); + } + + #endif + +/* + const int j0 = 16 * ( X0[16] & ( N - 1 ) ); + const int j1 = 16 * ( X1[16] & ( N - 1 ) ); + const int j2 = 16 * ( X2[16] & ( N - 1 ) ); + const int j3 = 16 * ( X3[16] & ( N - 1 ) ); + + for ( int k = 0; k < 16; k++ ) + { + const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; + const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; + const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; + const uint64_t v3 = ( (uint64_t*)V3 )[ j3+k ]; + ( (uint64_t*)X0 )[k] ^= v0; + ( (uint64_t*)X1 )[k] ^= v1; + ( (uint64_t*)X2 )[k] ^= v2; + ( (uint64_t*)X3 )[k] ^= v3; + } +*/ + + salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } +} + + +#endif // SSE2 + + +// Reference, used only for testing. +// Tested OK. + +static void xor_salsa8(uint32_t * const B, const uint32_t * const C) +{ + uint32_t x0 = (B[ 0] ^= C[ 0]), + x1 = (B[ 1] ^= C[ 1]), + x2 = (B[ 2] ^= C[ 2]), + x3 = (B[ 3] ^= C[ 3]); + uint32_t x4 = (B[ 4] ^= C[ 4]), + x5 = (B[ 5] ^= C[ 5]), + x6 = (B[ 6] ^= C[ 6]), + x7 = (B[ 7] ^= C[ 7]); + uint32_t x8 = (B[ 8] ^= C[ 8]), + x9 = (B[ 9] ^= C[ 9]), + xa = (B[10] ^= C[10]), + xb = (B[11] ^= C[11]); + uint32_t xc = (B[12] ^= C[12]), + xd = (B[13] ^= C[13]), + xe = (B[14] ^= C[14]), + xf = (B[15] ^= C[15]); + + + #define ROL32( a, c ) ror32( a, c ) + #define ADD32( a, b ) ( (a)+(b) ) + #define XOR( a, b ) ( (a)^(b) ) + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] += x0; + B[ 1] += x1; + B[ 2] += x2; + B[ 3] += x3; + B[ 4] += x4; + B[ 5] += x5; + B[ 6] += x6; + B[ 7] += x7; + B[ 8] += x8; + B[ 9] += x9; + B[10] += xa; + B[11] += xb; + B[12] += xc; + B[13] += xd; + B[14] += xe; + B[15] += xf; +} + +/** + * @param X input/ouput + * @param V scratch buffer + * @param N factor (def. 
1024) + */ + + +void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 128 ); + xor_salsa8( &X[ 0], &X[16] ); + xor_salsa8( &X[16], &X[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + int j = 32 * ( X[16] & ( N - 1 ) ); + for ( int k = 0; k < 32; k++ ) + X[k] ^= V[j + k]; + xor_salsa8( &X[ 0], &X[16] ); + xor_salsa8( &X[16], &X[ 0] ); + } +} + + + diff --git a/algo/scrypt/scrypt-core-4way.h b/algo/scrypt/scrypt-core-4way.h new file mode 100644 index 0000000..6567733 --- /dev/null +++ b/algo/scrypt/scrypt-core-4way.h @@ -0,0 +1,70 @@ +#ifndef SCRYPT_CORE_4WAY_H__ +#define SCRYPT_CORE_4WAY_H__ + +#include "simd-utils.h" +#include +#include + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ); + +// Serial SIMD over 4 way parallel +void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ); + +// 4 way parallel over serial SIMD +void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ); + +#endif + +#if defined(__AVX2__) + +void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N ); + +// 2 way parallel over SIMD128 +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ); + +// Double buffered 2 way parallel over SIMD128 +void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ); + +// Triplee buffered 2 way parallel over SIMD128 +void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ); + +// Serial SIMD128 over 2 way parallel +void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Double buffered simd over parallel +void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Triple buffered 2 way +void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Quadruple buffered +void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +#endif + +#if defined(__SSE2__) + +// Parallel 4 way, 4x memory +void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ); + +// Linear SIMD 1 way, 1x memory, lowest +void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Double buffered, 2x memory +void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Triple buffered +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Quadruple buffered, 4x memory +void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +#endif + +// For reference only +void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ); + +#endif + diff --git a/algo/scrypt/scrypt-core-ref.c b/algo/scrypt/scrypt-core-ref.c new file mode 100644 index 0000000..ec564ed --- /dev/null +++ b/algo/scrypt/scrypt-core-ref.c @@ -0,0 +1,206 @@ +#include "scrypt-core-ref.h" + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + +static void xor_salsa8(uint32_t * const B, const uint32_t * const C) +{ + uint32_t x0 = (B[ 0] ^= C[ 0]), + x1 = (B[ 1] ^= C[ 1]), + x2 = (B[ 2] ^= C[ 2]), + x3 = (B[ 3] ^= C[ 3]); + uint32_t x4 = (B[ 4] ^= C[ 4]), + x5 = (B[ 5] ^= C[ 5]), + x6 = (B[ 6] ^= C[ 6]), + x7 = (B[ 7] ^= C[ 7]); + uint32_t x8 = (B[ 8] ^= C[ 8]), + x9 = (B[ 9] ^= C[ 9]), + xa = (B[10] ^= C[10]), + xb = (B[11] ^= C[11]); + uint32_t xc = (B[12] ^= C[12]), + xd = (B[13] ^= C[13]), + xe = (B[14] ^= C[14]), + xf = (B[15] ^= C[15]); + + /* Operate 
on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. 
*/ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + B[ 0] += x0; + B[ 1] += x1; + B[ 2] += x2; + B[ 3] += x3; + B[ 4] += x4; + B[ 5] += x5; + B[ 6] += x6; + B[ 7] += x7; + B[ 8] += x8; + B[ 9] += x9; + B[10] += xa; + B[11] += xb; + B[12] += xc; + B[13] += xd; + B[14] += xe; + B[15] += xf; +} + +/** + * @param X input/ouput + * @param V scratch buffer + * @param N factor (def. 1024) + */ +void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N) +{ + for (uint32_t i = 0; i < N; i++) { + memcpy(&V[i * 32], X, 128); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } + for (uint32_t i = 0; i < N; i++) { + uint32_t j = 32 * (X[16] & (N - 1)); + for (uint8_t k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } +} + diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index e35adbf..a15b5cb 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -32,6 +32,9 @@ #include #include #include +#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha256-hash.h" +#include static const uint32_t keypad[12] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 @@ -46,81 +49,103 @@ static const uint32_t finalblk[16] = { 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 }; -static __thread char *scratchbuf; -int scratchbuf_size = 0; +static const uint32_t sha256_initial_state[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static int scrypt_throughput = 0; + +static int scratchbuf_size = 0; + +static __thread char *scratchbuf = NULL; + +// change this to a constant to be used directly as input state arg +// vectors still need an init function. 
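+// The scalar and vector SHA256 transforms used below take the input state as
+// an explicit third argument instead of updating it in place:
+// sha256_transform_le/_be( state_out, data, state_in ) replaces the old
+// sha256_transform( state, data, swap ), with _le/_be selecting little or big
+// endian interpretation of the 64 byte data block (the old swap flag).
+// This allows a constant such as sha256_initial_state to be passed directly
+// as the input state, avoiding a separate init and copy.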
+static inline void sha256_init_state( uint32_t *state ) +{ + state[ 0 ] = 0x6A09E667; + state[ 1 ] = 0xBB67AE85; + state[ 2 ] = 0x3C6EF372; + state[ 3 ] = 0xA54FF53A; + state[ 4 ] = 0x510E527F; + state[ 5 ] = 0x9B05688C; + state[ 6 ] = 0x1F83D9AB; + state[ 7 ] = 0x5BE0CD19; +} static inline void HMAC_SHA256_80_init(const uint32_t *key, uint32_t *tstate, uint32_t *ostate) { - uint32_t ihash[8]; - uint32_t pad[16]; - int i; + uint32_t ihash[8]; + uint32_t pad[16]; + int i; - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 16, 16); + memcpy(pad + 4, keypad, 48); - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); + sha256_transform_le( tstate, pad, tstate ); - sha256_init(tstate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); + memcpy( ihash, tstate, 32 ); + + for ( i = 0; i < 8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_transform_le( ostate, pad, sha256_initial_state ); + + for ( i = 0; i < 8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 16; i++ ) pad[i] = 0x36363636; + + sha256_transform_le( tstate, pad, sha256_initial_state ); } static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + int i, j; - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); + sha256_transform_le( istate, salt, tstate ); - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); + memcpy(ibuf, salt + 16, 16); + memcpy(ibuf + 5, innerpad, 44); + memcpy(obuf + 8, outerpad, 32); - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } + for (i = 0; i < 4; i++) + { + memcpy(obuf, istate, 32); + ibuf[4] = i + 1; + + sha256_transform_le( obuf, ibuf, obuf ); + sha256_transform_le( ostate2, obuf, ostate ); + + for (j = 0; j < 8; j++) + output[8 * i + j] = bswap_32( ostate2[j] ); + } } static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); + uint32_t buf[16]; + int i; - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); + sha256_transform_be( tstate, salt, tstate ); + sha256_transform_be( tstate, salt+16, tstate ); + sha256_transform_le( tstate, finalblk, tstate ); + + memcpy(buf, tstate, 32); + memcpy(buf + 8, outerpad, 32); + + sha256_transform_le( ostate, buf, ostate ); + + for (i = 0; i < 8; i++) + output[i] = bswap_32( ostate[i] ); } - #ifdef HAVE_SHA256_4WAY static const uint32_t keypad_4way[4 * 12] = { @@ -160,6 +185,8 @@ static const uint32_t outerpad_4way[4 * 8] = { 0x00000000, 
0x00000000, 0x00000000, 0x00000000, 0x00000300, 0x00000300, 0x00000300, 0x00000300 }; + +/* static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x80000000, 0x80000000, 0x80000000, 0x80000000, @@ -178,37 +205,51 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000620, 0x00000620, 0x00000620, 0x00000620 }; +*/ -static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) +static inline void sha256_4way_init_state( void *state ) +{ + casti_m128i( state, 0 ) = _mm_set1_epi32( 0x6A09E667 ); + casti_m128i( state, 1 ) = _mm_set1_epi32( 0xBB67AE85 ); + casti_m128i( state, 2 ) = _mm_set1_epi32( 0x3C6EF372 ); + casti_m128i( state, 3 ) = _mm_set1_epi32( 0xA54FF53A ); + casti_m128i( state, 4 ) = _mm_set1_epi32( 0x510E527F ); + casti_m128i( state, 5 ) = _mm_set1_epi32( 0x9B05688C ); + casti_m128i( state, 6 ) = _mm_set1_epi32( 0x1F83D9AB ); + casti_m128i( state, 7 ) = _mm_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_4way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) { uint32_t _ALIGN(16) ihash[4 * 8]; uint32_t _ALIGN(16) pad[4 * 16]; int i; /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); + memcpy( pad, key + 4*16, 4*16 ); + memcpy( pad + 4*4, keypad_4way, 4*48 ); - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); + sha256_4way_transform_le( (__m128i*)ihash, (__m128i*)pad, + (const __m128i*)tstate ); - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); + sha256_4way_init_state( tstate ); + + for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)pad, + (const __m128i*)tstate ); + + for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 4*16; i++ ) pad[i] = 0x36363636; + + sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)pad, + (const __m128i*)tstate ); } -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { uint32_t _ALIGN(16) istate[4 * 8]; uint32_t _ALIGN(16) ostate2[4 * 8]; @@ -216,43 +257,62 @@ static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, uint32_t _ALIGN(16) obuf[4 * 16]; int i, j; - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); + sha256_4way_transform_le( (__m128i*)istate, (__m128i*)salt, + (const __m128i*)tstate ); memcpy(ibuf, salt + 4 * 16, 4 * 16); memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); + for ( i = 0; i < 4; i++ ) + { ibuf[4 * 4 + 0] = i + 1; ibuf[4 * 4 + 1] = i + 1; ibuf[4 * 4 + 2] = i + 1; ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 
8 * i + j] = swab32(ostate2[j]); + sha256_4way_transform_le( (__m128i*)obuf, (__m128i*)ibuf, + (const __m128i*)istate ); + + sha256_4way_transform_le( (__m128i*)ostate2, (__m128i*)obuf, + (const __m128i*)ostate ); + + for ( j = 0; j < 4 * 8; j++ ) + output[4 * 8 * i + j] = bswap_32( ostate2[j] ); } } -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { - uint32_t _ALIGN(16) buf[4 * 16]; + __m128i _ALIGN(64) final[ 8*16 ]; + uint32_t _ALIGN(64) buf[4 * 16]; int i; - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); + sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)salt, + (const __m128i*)tstate ); + sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)( salt + 4*16), + (const __m128i*)tstate ); + + final[ 0] = _mm_set1_epi32( 0x00000001 ); + final[ 1] = _mm_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm_setzero_si128(); + final[15] = _mm_set1_epi32 ( 0x00000620 ); + + sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)final, + (const __m128i*)tstate ); + + memcpy(buf, tstate, 4 * 32); memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - sha256_transform_4way(ostate, buf, 0); - for (i = 0; i < 4 * 8; i++) - output[i] = swab32(ostate[i]); + sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)buf, + (const __m128i*)ostate ); + + for ( i = 0; i < 4 * 8; i++ ) + output[i] = bswap_32( ostate[i] ); } #endif /* HAVE_SHA256_4WAY */ @@ -260,6 +320,7 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, #ifdef HAVE_SHA256_8WAY +/* static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = { 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, @@ -278,41 +339,52 @@ static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 }; +*/ -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) +static inline void sha256_8way_init_state( void *state ) +{ + casti_m256i( state, 0 ) = _mm256_set1_epi32( 0x6A09E667 ); + casti_m256i( state, 1 ) = _mm256_set1_epi32( 0xBB67AE85 ); + casti_m256i( state, 2 ) = _mm256_set1_epi32( 0x3C6EF372 ); + casti_m256i( state, 3 ) = _mm256_set1_epi32( 0xA54FF53A ); + casti_m256i( state, 4 ) = _mm256_set1_epi32( 0x510E527F ); + casti_m256i( state, 5 ) = _mm256_set1_epi32( 0x9B05688C ); + casti_m256i( state, 6 ) = _mm256_set1_epi32( 0x1F83D9AB ); + casti_m256i( state, 7 ) = _mm256_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_8way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) { uint32_t _ALIGN(32) ihash[8 * 8]; uint32_t _ALIGN(32) pad[8 * 16]; int i; - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - 
sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); + memcpy( pad, key + 8*16, 8*16 ); + for ( i = 0; i < 8; i++ ) pad[ 8*4 + i ] = 0x80000000; + memset( pad + 8*5, 0x00, 8*40 ); + for ( i = 0; i < 8; i++ ) pad[ 8*15 + i ] = 0x00000280; + + sha256_8way_transform_le( (__m256i*)ihash, (__m256i*)pad, + (const __m256i*)tstate ); + + sha256_8way_init_state( tstate ); + + for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 8*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)pad, + (const __m256i*)tstate ); + + for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 8*16; i++ ) pad[i] = 0x36363636; + + sha256_8way_transform_le( (__m256i*)tstate, (__m256i*)pad, + (const __m256i*)tstate ); } -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { uint32_t _ALIGN(32) istate[8 * 8]; uint32_t _ALIGN(32) ostate2[8 * 8]; @@ -320,24 +392,20 @@ static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, uint32_t _ALIGN(32) obuf[8 * 16]; int i, j; - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); + sha256_8way_transform_le( (__m256i*)istate, (__m256i*)salt, + (const __m256i*)tstate ); + + memcpy( ibuf, salt + 8*16, 8*16 ); + for ( i = 0; i < 8; i++ ) ibuf[ 8*5 + i ] = 0x80000000; + memset( ibuf + 8*6, 0x00, 8*36 ); + for ( i = 0; i < 8; i++ ) ibuf[ 8*15 + i ] = 0x000004a0; - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; + for ( i = 0; i < 8; i++ ) obuf[ 8*8 + i ] = 0x80000000; + memset( obuf + 8*9, 0x00, 8*24 ); + for ( i = 0; i < 8; i++ ) obuf[ 8*15 + i ] = 0x00000300; - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); + for ( i = 0; i < 4; i++ ) + { ibuf[8 * 4 + 0] = i + 1; ibuf[8 * 4 + 1] = i + 1; ibuf[8 * 4 + 2] = i + 1; @@ -346,48 +414,198 @@ static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, ibuf[8 * 4 + 5] = i + 1; ibuf[8 * 4 + 6] = i + 1; ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); + + sha256_8way_transform_le( (__m256i*)obuf, (__m256i*)ibuf, + (const __m256i*)istate ); + + sha256_8way_transform_le( (__m256i*)ostate2, (__m256i*)obuf, + (const __m256i*)ostate ); + + for ( j = 0; j < 8*8; j++ ) + output[ 8*8*i + j ] = bswap_32( ostate2[j] ); } } -static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { - 
uint32_t _ALIGN(32) buf[8 * 16]; + __m256i _ALIGN(128) final[ 8*16 ]; + uint32_t _ALIGN(128) buf[ 8*16 ]; int i; - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 0x00000300; - sha256_transform_8way(ostate, buf, 0); - + sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)salt, + (const __m256i*)tstate ); + sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16), + (const __m256i*)tstate ); + + final[ 0] = _mm256_set1_epi32( 0x00000001 ); + final[ 1] = _mm256_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm256_setzero_si256(); + final[15] = _mm256_set1_epi32 ( 0x00000620 ); + + sha256_8way_transform_le( (__m256i*)tstate, final, + (const __m256i*)tstate ); + + memcpy( buf, tstate, 8*32 ); + for ( i = 0; i < 8; i++ ) buf[ 8*8 + i ] = 0x80000000; + memset( buf + 8*9, 0x00, 8*24 ); + for ( i = 0; i < 8; i++ ) buf[ 8*15 + i ] = 0x00000300; + + sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)buf, + (const __m256i*)ostate ); + for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); + output[i] = bswap_32(ostate[i]); } #endif /* HAVE_SHA256_8WAY */ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +static inline void sha256_16way_init_state( void *state ) +{ + casti_m512i( state, 0 ) = _mm512_set1_epi32( 0x6A09E667 ); + casti_m512i( state, 1 ) = _mm512_set1_epi32( 0xBB67AE85 ); + casti_m512i( state, 2 ) = _mm512_set1_epi32( 0x3C6EF372 ); + casti_m512i( state, 3 ) = _mm512_set1_epi32( 0xA54FF53A ); + casti_m512i( state, 4 ) = _mm512_set1_epi32( 0x510E527F ); + casti_m512i( state, 5 ) = _mm512_set1_epi32( 0x9B05688C ); + casti_m512i( state, 6 ) = _mm512_set1_epi32( 0x1F83D9AB ); + casti_m512i( state, 7 ) = _mm512_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_16way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) +{ + uint32_t _ALIGN(128) pad[16*16]; + uint32_t _ALIGN(128) ihash[16* 8]; + int i; + + memcpy( pad, key + 16*16, 16*16 ); + for ( i = 0; i < 16; i++ ) pad[ 16*4 + i ] = 0x80000000; + memset( pad + 16*5, 0x00, 16*40 ); + for ( i = 0; i < 16; i++ ) pad[ 16*15 + i ] = 0x00000280; + + sha256_16way_transform_le( (__m512i*)ihash, (__m512i*)pad, + (const __m512i*)tstate ); + + sha256_16way_init_state( tstate ); + + for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 16*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)pad, + (const __m512i*)tstate ); + + for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 16*16; i++ ) pad[i] = 0x36363636; + + sha256_16way_transform_le( (__m512i*)tstate, (__m512i*)pad, + (const __m512i*)tstate ); +} + + +static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) +{ + uint32_t _ALIGN(128) ibuf[ 16*16 ]; + uint32_t _ALIGN(128) obuf[ 16*16 ]; + uint32_t _ALIGN(128) istate[ 16*8 ]; + uint32_t _ALIGN(128) ostate2[ 16*8 ]; + int i, j; + + sha256_16way_transform_le( (__m512i*)istate, (__m512i*)salt, + (const __m512i*)tstate ); + + memcpy( ibuf, salt + 16*16, 16*16 ); + for ( i = 0; i < 16; i++ 
) ibuf[ 16*5 + i ] = 0x80000000; + memset( ibuf + 16*6, 0x00, 16*36 ); + for ( i = 0; i < 16; i++ ) ibuf[ 16*15 + i ] = 0x000004a0; + + for ( i = 0; i < 16; i++ ) obuf[ 16*8 + i ] = 0x80000000; + memset( obuf + 16*9, 0x00, 16*24 ); + for ( i = 0; i < 16; i++ ) obuf[ 16*15 + i ] = 0x00000300; + + for ( i = 0; i < 4; i++ ) + { + ibuf[ 16*4 + 0 ] = i + 1; + ibuf[ 16*4 + 1 ] = i + 1; + ibuf[ 16*4 + 2 ] = i + 1; + ibuf[ 16*4 + 3 ] = i + 1; + ibuf[ 16*4 + 4 ] = i + 1; + ibuf[ 16*4 + 5 ] = i + 1; + ibuf[ 16*4 + 6 ] = i + 1; + ibuf[ 16*4 + 7 ] = i + 1; + ibuf[ 16*4 + 8 ] = i + 1; + ibuf[ 16*4 + 9 ] = i + 1; + ibuf[ 16*4 + 10 ] = i + 1; + ibuf[ 16*4 + 11 ] = i + 1; + ibuf[ 16*4 + 12 ] = i + 1; + ibuf[ 16*4 + 13 ] = i + 1; + ibuf[ 16*4 + 14 ] = i + 1; + ibuf[ 16*4 + 15 ] = i + 1; + + sha256_16way_transform_le( (__m512i*)obuf, (__m512i*)ibuf, + (const __m512i*)istate ); + + sha256_16way_transform_le( (__m512i*)ostate2, (__m512i*)obuf, + (const __m512i*)ostate ); + + for ( j = 0; j < 16*8; j++ ) + output[ 16*8*i + j ] = bswap_32( ostate2[j] ); + } +} + +static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) +{ + __m512i _ALIGN(128) final[ 16*16 ]; + uint32_t _ALIGN(128) buf[ 16*16 ]; + int i; + + sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)salt, + (const __m512i*)tstate ); + sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16), + (const __m512i*)tstate ); + + final[ 0] = _mm512_set1_epi32( 0x00000001 ); + final[ 1] = _mm512_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm512_setzero_si512(); + final[15] = _mm512_set1_epi32 ( 0x00000620 ); + + sha256_16way_transform_le( (__m512i*)tstate, final, + (const __m512i*)tstate ); + + memcpy( buf, tstate, 16*32 ); + for ( i = 0; i < 16; i++ ) buf[ 16*8 + i ] = 0x80000000; + memset( buf + 16*9, 0x00, 16*24 ); + for ( i = 0; i < 16; i++ ) buf[ 16*15 + i ] = 0x00000300; + + sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)buf, + (const __m512i*)ostate ); + + for ( i = 0; i < 16*8; i++ ) + output[i] = bswap_32( ostate[i] ); +} + +#endif // AVX512 //#if defined(USE_ASM) && defined(__x86_64__) #define SCRYPT_MAX_WAYS 12 #define HAVE_SCRYPT_3WAY 1 -int scrypt_best_throughput(); +//int scrypt_best_throughput(); void scrypt_core(uint32_t *X, uint32_t *V, int N); void scrypt_core_3way(uint32_t *X, uint32_t *V, int N); -#if defined(USE_AVX2) + +//#if defined(USE_AVX2) +#if defined(__AVX2__) #undef SCRYPT_MAX_WAYS #define SCRYPT_MAX_WAYS 24 #define HAVE_SCRYPT_6WAY 1 @@ -396,261 +614,633 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N); #ifndef SCRYPT_MAX_WAYS #define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 +//#define scrypt_best_throughput() 1 #endif -unsigned char *scrypt_buffer_alloc(int N) -{ - return (uchar*) malloc((size_t)N * SCRYPT_MAX_WAYS * 128 + 63); -} +#include "scrypt-core-4way.h" -static bool scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, +static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id ) { uint32_t tstate[8], ostate[8]; uint32_t X[32]; - uint32_t *V; + uint32_t *V = (uint32_t*)scratchpad; - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - memcpy(tstate, midstate, 32); HMAC_SHA256_80_init(input, tstate, ostate); PBKDF2_SHA256_80_128(tstate, ostate, input, X); - 
scrypt_core(X, V, N); + scrypt_core_simd128( X, V, N ); // woring +// scrypt_core_1way( X, V, N ); // working +// scrypt_core(X, V, N); PBKDF2_SHA256_128_32(tstate, ostate, X, output); return true; } -#ifdef HAVE_SHA256_4WAY -static int scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) +#if defined(__AVX2__) + +static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[4 * 8]; - uint32_t _ALIGN(128) ostate[4 * 8]; - uint32_t _ALIGN(128) W[4 * 32]; - uint32_t _ALIGN(128) X[4 * 32]; - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + uint32_t _ALIGN(128) tstate[8 * 8]; + uint32_t _ALIGN(128) ostate[8 * 8]; + uint32_t _ALIGN(128) W[8 * 32]; + uint32_t _ALIGN(128) X[8 * 32]; + uint32_t *V = (uint32_t*)scratchpad; - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; + intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60, + input+80, input+100, input+120, input+140, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m256i( tstate, i ) = _mm256_set1_epi32( midstate[i] ); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_8way( W, tstate, ostate ); + PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W ); - HMAC_SHA256_80_init_4way(W, tstate, ostate); + dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 ); + + + // SCRYPT CORE + + + // AVX512 + +/* + // AVX512 16 way working + intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, 1024 ); + + scrypt_core_16way( (__m512i*)W , (__m512i*)V, N ); + + dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, W, 1024 ); +*/ +/* + // AVX512 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ +/* + // AVX512, not working, very slow + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ + + // AVX2 + +/* + // AVX2 + // disable de/interleave for testing. 
+ scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); +*/ + +/* + // AVX2 working + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + + // working +// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + // working + scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + + // working +// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); +*/ + +/* + // AVX2 + intrlv_2x32( W, X , X+ 32, 1024 ); + intrlv_2x32( W+64, X+ 64, X+ 96, 1024 ); + intrlv_2x32( W+128, X+128, X+160, 1024 ); + intrlv_2x32( W+192, X+192, X+224, 1024 ); + + // working, deprecated, not up to data +// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); + + // deprecated, not up to date +// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + // working +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + +// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+ 64 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + dintrlv_2x32( X, X+ 32, W, 1024 ); + dintrlv_2x32( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x32( X+128, X+160, W+128, 1024 ); + dintrlv_2x32( X+192, X+224, W+192, 1024 ); +*/ + + // SSE2 + +/* + // SSE2 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); + scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ + +/* + // SSE2 + scrypt_core_simd128( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+160, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+192, V, 
N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+224, V, N ); +*/ +/* + // SSE2 working + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); +*/ + + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + +/* + // SSE2 working + scrypt_core_simd128_4buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+128, V, N ); +*/ - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); if ( work_restart[thrid].restart ) return 0; - - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; + intrlv_8x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, 1024 ); + + PBKDF2_SHA256_128_32_8way( tstate, ostate, W, W ); + + dintrlv_8x32( output, output+ 8, output+16, output+24, + output+32, output+40, output+48, output+56, W, 256 ); + + return 1; +} + +#endif // AVX2 + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) +{ + uint32_t _ALIGN(128) tstate[ 16*8 ]; + uint32_t _ALIGN(128) ostate[ 16*8 ]; + uint32_t _ALIGN(128) W[ 16*32 ]; + uint32_t _ALIGN(128) X[ 16*32 ]; + uint32_t *V = (uint32_t*)scratchpad; + + intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60, + input+ 80, input+100, input+120, input+140, + input+160, input+180, input+200, input+220, + input+240, input+260, input+280, input+300, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m512i( tstate, i ) = _mm512_set1_epi32( midstate[i] ); + + HMAC_SHA256_80_init_16way( W, tstate, ostate ); + PBKDF2_SHA256_80_128_16way( tstate, ostate, W, W ); + + dintrlv_16x32( X, X+ 32, X+ 64, X+ 96, X+128, X+160, X+192, X+224, + X+256, X+288, X+320, X+352, X+384, X+416, X+448, X+480, + W, 1024 ); + + + // SCRYPT CORE + + + // AVX512 +/* + // AVX512 16 way working + intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, 1024 ); + + scrypt_core_16way( (__m512i*)W , (__m512i*)V, N ); + + dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, W, 1024 ); +*/ +/* + // AVX512 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ +/* + // AVX512, not working, very slow + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 
1024 ); +*/ + + // AVX2 + +/* + // AVX2 + // disable de/interleave for testing. + scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); +*/ + +/* + // AVX2 working + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + + // working +// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + // working + scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + + // working +// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); +*/ + +/* + // AVX2 + intrlv_2x32( W, X , X+ 32, 1024 ); + intrlv_2x32( W+64, X+ 64, X+ 96, 1024 ); + intrlv_2x32( W+128, X+128, X+160, 1024 ); + intrlv_2x32( W+192, X+192, X+224, 1024 ); + + // working, deprecated, not up to data +// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); + + // deprecated, not up to date +// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + // working +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + +// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+ 64 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + dintrlv_2x32( X, X+ 32, W, 1024 ); + dintrlv_2x32( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x32( X+128, X+160, W+128, 1024 ); + dintrlv_2x32( X+192, X+224, W+192, 1024 ); +*/ + + // SSE2 + +/* + // SSE2 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); + scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ +/* + // SSE2 + scrypt_core_simd128( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+160, V, N ); + 
if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+224, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+288, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+320, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+384, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+416, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+448, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+480, V, N ); +*/ +/* + // SSE2 working + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+320, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+384, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); +*/ + + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); + +/* + // SSE2 working + scrypt_core_simd128_4buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+384, V, N ); +*/ +/* + scrypt_core_3way( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); +*/ + + + if ( work_restart[thrid].restart ) return 0; + + intrlv_16x32( W, X, X+ 32, X+ 64, X+ 96, X+128, X+160, X+192, X+224, + X+256, X+288, X+320, X+352, X+384, X+416, X+448, X+480, + 1024 ); + + PBKDF2_SHA256_128_32_16way( tstate, ostate, W, W ); + + dintrlv_16x32( output, output+ 8, output+ 16, output+ 24, + output+ 32, output+ 40, output+ 48, output+ 56, + output+ 64, output+ 72, output+ 80, output+ 88, + output+ 96, output+104, output+112, output+120, W, 256 ); + + return 1; +} + + +#endif // AVX512 + +#if defined(__SHA__) + +static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) +{ + uint32_t _ALIGN(128) 
tstate[4 * 8]; + uint32_t _ALIGN(128) ostate[4 * 8]; + uint32_t _ALIGN(128) W[4 * 32]; + uint32_t *V = (uint32_t*)scratchpad; + + memcpy( tstate, midstate, 32 ); + memcpy( tstate+ 8, midstate, 32 ); + memcpy( tstate+16, midstate, 32 ); + memcpy( tstate+24, midstate, 32 ); + + HMAC_SHA256_80_init( input, tstate, ostate ); + PBKDF2_SHA256_80_128( tstate, ostate, input, W ); + + HMAC_SHA256_80_init( input +20, tstate+ 8, ostate+ 8 ); + PBKDF2_SHA256_80_128( tstate+ 8, ostate+ 8, input +20, W+32 ); + + HMAC_SHA256_80_init( input +40, tstate+16, ostate+16 ); + PBKDF2_SHA256_80_128( tstate+16, ostate+16, input +40, W+64 ); + + HMAC_SHA256_80_init( input +60, tstate+24, ostate+24 ); + PBKDF2_SHA256_80_128( tstate+24, ostate+24, input +60, W+96 ); + +/* + // Working Linear single threaded SIMD + scrypt_core_simd128( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+96, V, N ); +*/ + + // working, double buffered linear simd + scrypt_core_simd128_2buf( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( W+64, V, N ); + +/* + scrypt_core_simd128_3buf( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+96, V, N ); +*/ + + // working +// scrypt_core_simd128_4buf( W, V, N ); + + if ( work_restart[thrid].restart ) return 0; + + PBKDF2_SHA256_128_32( tstate, ostate, W, output ); + + PBKDF2_SHA256_128_32( tstate+ 8, ostate+ 8, W+32, output+ 8 ); + + PBKDF2_SHA256_128_32( tstate+16, ostate+16, W+64, output+16 ); + + PBKDF2_SHA256_128_32( tstate+24, ostate+24, W+96, output+24 ); + + return 1; +} + +#else + +#ifdef HAVE_SHA256_4WAY +static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) +{ + uint32_t _ALIGN(128) tstate[4 * 8]; + uint32_t _ALIGN(128) ostate[4 * 8]; + uint32_t _ALIGN(128) W[4 * 32]; + uint32_t _ALIGN(128) X[4 * 32]; + uint32_t *V = (uint32_t*)scratchpad; + + intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m128i( tstate, i ) = _mm_set1_epi32( midstate[i] ); + + HMAC_SHA256_80_init_4way(W, tstate, ostate); + PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); + + dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); + +////// SCRYPT_CORE + + + // working, simple 4 way parallel, best for scrypt +// scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); + +/* + // Working Linear single threaded SIMD + scrypt_core_simd128( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+96, V, N ); +*/ + + // working, double buffered linear simd, best for n2 + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + +/* + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+96, V, N ); +*/ + + // working +// scrypt_core_simd128_4buf( X, V, N ); + + +/* + // original scrypt_core(X + 0 * 32, V, N); scrypt_core(X + 1 * 32, V, N); scrypt_core(X + 2 * 32, V, N); scrypt_core(X + 3 * 32, V, N); +*/ + +//////////////////////////////// if ( work_restart[thrid].restart ) return 0; - for (i = 0; i < 32; i++) - for (k = 
0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; + intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; + dintrlv_4x32( output, output+8, output+16, output+24, W, 256 ); return 1; } #endif /* HAVE_SHA256_4WAY */ -#ifdef HAVE_SCRYPT_3WAY - -static int scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) -{ - uint32_t _ALIGN(64) tstate[3 * 8], ostate[3 * 8]; - uint32_t _ALIGN(64) X[3 * 32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - memcpy(tstate + 16, midstate, 32); - - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); - - if ( work_restart[thrid].restart ) return 0; - - PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); - - if ( work_restart[thrid].restart ) return 0; - - scrypt_core_3way(X, V, N); - - if ( work_restart[thrid].restart ) return 0; - - PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); - - return 1; -} - -#ifdef HAVE_SHA256_4WAY -static bool scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) -{ - uint32_t _ALIGN(128) tstate[12 * 8]; - uint32_t _ALIGN(128) ostate[12 * 8]; - uint32_t _ALIGN(128) W[12 * 32]; - uint32_t _ALIGN(128) X[12 * 32]; - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; - - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; - - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); - - if ( work_restart[thrid].restart ) return 0; - - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - - if ( work_restart[thrid].restart ) return 0; - - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; - - scrypt_core_3way(X + 0 * 96, V, N); - scrypt_core_3way(X + 1 * 96, V, N); - scrypt_core_3way(X + 2 * 96, V, N); - scrypt_core_3way(X + 3 * 96, V, N); - - if ( work_restart[thrid].restart ) return 0; - - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; - - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); - - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - 
output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; - - return 1; -} -#endif /* HAVE_SHA256_4WAY */ - -#endif /* HAVE_SCRYPT_3WAY */ - -#ifdef HAVE_SCRYPT_6WAY -static int scrypt_1024_1_1_256_24way( const uint32_t *input, - uint32_t *output, uint32_t *midstate, - unsigned char *scratchpad, int N, int thrid ) -{ - uint32_t _ALIGN(128) tstate[24 * 8]; - uint32_t _ALIGN(128) ostate[24 * 8]; - uint32_t _ALIGN(128) W[24 * 32]; - uint32_t _ALIGN(128) X[24 * 32]; - uint32_t *V; - int i, j, k; - - V = (uint32_t *)( ( (uintptr_t)(scratchpad) + 63 ) & ~ (uintptr_t)(63) ); - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 20; i++ ) - for ( k = 0; k < 8; k++ ) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 8; i++ ) - for ( k = 0; k < 8; k++ ) - tstate[8 * 8 * j + 8 * i + k] = midstate[i]; - - HMAC_SHA256_80_init_8way( W + 0, tstate + 0, ostate + 0 ); - HMAC_SHA256_80_init_8way( W + 256, tstate + 64, ostate + 64 ); - HMAC_SHA256_80_init_8way( W + 512, tstate + 128, ostate + 128 ); - - if ( work_restart[thrid].restart ) return 0; - - PBKDF2_SHA256_80_128_8way( tstate + 0, ostate + 0, W + 0, W + 0 ); - PBKDF2_SHA256_80_128_8way( tstate + 64, ostate + 64, W + 256, W + 256 ); - PBKDF2_SHA256_80_128_8way( tstate + 128, ostate + 128, W + 512, W + 512 ); - - if ( work_restart[thrid].restart ) return 0; - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 32; i++ ) - for ( k = 0; k < 8; k++ ) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - - scrypt_core_6way( X + 0 * 32, V, N ); - scrypt_core_6way( X + 6 * 32, V, N ); - - if ( work_restart[thrid].restart ) return 0; - - scrypt_core_6way( X + 12 * 32, V, N ); - scrypt_core_6way( X + 18 * 32, V, N ); - - if ( work_restart[thrid].restart ) return 0; - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 32; i++ ) - for ( k = 0; k < 8; k++ ) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; - - PBKDF2_SHA256_128_32_8way( tstate + 0, ostate + 0, W + 0, W + 0 ); - PBKDF2_SHA256_128_32_8way( tstate + 64, ostate + 64, W + 256, W + 256 ); - PBKDF2_SHA256_128_32_8way( tstate + 128, ostate + 128, W + 512, W + 512 ); - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 8; i++ ) - for ( k = 0; k < 8; k++ ) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; - - return 1; -} -#endif /* HAVE_SCRYPT_6WAY */ +#endif // SHA extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) @@ -660,67 +1250,58 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; uint32_t midstate[8]; uint32_t n = pdata[19] - 1; - int thr_id = mythr->id; // thr_id arg is deprecated - int throughput = scrypt_best_throughput(); + int thr_id = mythr->id; + int throughput = scrypt_throughput; int i; volatile uint8_t *restart = &(work_restart[thr_id].restart); -#ifdef HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - -// applog(LOG_INFO,"Scrypt thoughput %d",throughput); + for ( i = 0; i < throughput; i++ ) + memcpy( data + i * 20, pdata, 80 ); + + sha256_transform_le( midstate, data, sha256_initial_state ); - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - do { bool rc = true; - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if defined(HAVE_SHA256_4WAY) - if (throughput == 4) - rc = scrypt_1024_1_1_256_4way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - 
else + for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + if ( throughput == 16 ) + rc = scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); + else #endif -#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) - if (throughput == 12) - rc = scrypt_1024_1_1_256_12way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else +#if defined(__AVX2__) + if ( throughput == 8 ) + rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); + else #endif -#if defined(HAVE_SCRYPT_6WAY) - if (throughput == 24) - rc = scrypt_1024_1_1_256_24way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else + if ( throughput == 4 ) +#if defined(__SHA__) + rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); +#else + rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); #endif -#if defined(HAVE_SCRYPT_3WAY) - if (throughput == 3) - rc = scrypt_1024_1_1_256_3way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else -#endif - rc = scrypt_1024_1_1_256(data, hash, midstate, scratchbuf, - scratchbuf_size, thr_id ); - + else + rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); + if ( rc ) for ( i = 0; i < throughput; i++ ) { - if ( unlikely( valid_hash( hash + i * 8, ptarget ) ) ) + if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) ) { - pdata[19] = data[i * 20 + 19]; +// applog( LOG_INFO, "Thread %d, Lane %d", thr_id,i ); + pdata[19] = data[i * 20 + 19]; submit_solution( work, hash + i * 8, mythr ); - } + } } - } while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) ); + + + } while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) ); *hashes_done = n - pdata[19]; pdata[19] = n; @@ -729,28 +1310,51 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, bool scrypt_miner_thread_init( int thr_id ) { - scratchbuf = scrypt_buffer_alloc( scratchbuf_size ); - if ( scratchbuf ) - return true; - applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); - return false; + scratchbuf = _mm_malloc( scratchbuf_size, 128 ); + if ( scratchbuf ) + return true; + applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); + return false; } bool register_scrypt_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX2_OPT; - gate->miner_thread_init =(void*)&scrypt_miner_thread_init; - gate->scanhash = (void*)&scanhash_scrypt; - opt_target_factor = 65536.0; +#if defined(__SHA__) + gate->optimizations = SSE2_OPT | SHA_OPT; +#else + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; +#endif + gate->miner_thread_init =(void*)&scrypt_miner_thread_init; + gate->scanhash = (void*)&scanhash_scrypt; + opt_target_factor = 65536.0; + opt_param_n = opt_param_n ? 
opt_param_n : 1024; + applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n ); - if ( !opt_param_n ) - { - opt_param_n = 1024; - scratchbuf_size = 1024; - } - else - scratchbuf_size = opt_param_n; - applog(LOG_INFO,"Scrypt paramaters: N= %d, R= 1.", opt_param_n ); - return true; +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + scrypt_throughput = 16; + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf +#elif defined(__SHA__) + scrypt_throughput = 4; + scratchbuf_size = opt_param_n * 2 * 128; // 2 buf +#elif defined(__AVX2__) + scrypt_throughput = 8; + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf +#else + scrypt_throughput = 4; + scratchbuf_size = opt_param_n * 2 * 128; // 2 buf +#endif + + char t_units[4] = {0}; + char d_units[4] = {0}; + double t_size = (double)scratchbuf_size; + double d_size = (double)scratchbuf_size * opt_n_threads; + + format_number_si( &t_size, t_units ); + format_number_si( &d_size, d_units ); + + applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n", + scrypt_throughput, t_size, t_units, d_size, d_units ); + + return true; }; diff --git a/algo/sha/hmac-sha256-hash.c b/algo/sha/hmac-sha256-hash.c index e09a4c2..2cdf9c8 100644 --- a/algo/sha/hmac-sha256-hash.c +++ b/algo/sha/hmac-sha256-hash.c @@ -39,10 +39,10 @@ void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ) { - sph_sha256_context ctx; - sph_sha256_init( &ctx ); - sph_sha256( &ctx, in, len ); - sph_sha256_close( &ctx, digest ); + sha256_context ctx; + sha256_ctx_init( &ctx ); + sha256_update( &ctx, in, len ); + sha256_final( &ctx, digest ); } /** @@ -64,7 +64,7 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len, void HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) { - unsigned char pad[64]; + unsigned char pad[64] __attribute__ ((aligned (64))); unsigned char khash[32]; const unsigned char * K = _K; size_t i; @@ -72,29 +72,28 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) /* If Klen > 64, the key is really SHA256(K). */ if ( Klen > 64 ) { - sph_sha256_init( &ctx->ictx ); - sph_sha256( &ctx->ictx, K, Klen ); - sph_sha256_close( &ctx->ictx, khash ); - + sha256_ctx_init( &ctx->ictx ); + sha256_update( &ctx->ictx, K, Klen ); + sha256_final( &ctx->ictx, khash ); K = khash; Klen = 32; } /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - sph_sha256_init( &ctx->ictx ); + sha256_ctx_init( &ctx->ictx ); for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36; memset( pad + Klen, 0x36, 64 - Klen ); - sph_sha256( &ctx->ictx, pad, 64 ); + sha256_update( &ctx->ictx, pad, 64 ); /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ - sph_sha256_init( &ctx->octx ); + sha256_ctx_init( &ctx->octx ); for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c; memset( pad + Klen, 0x5c, 64 - Klen ); - sph_sha256( &ctx->octx, pad, 64 ); + sha256_update( &ctx->octx, pad, 64 ); } /* Add bytes to the HMAC-SHA256 operation. */ @@ -102,18 +101,17 @@ void HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len ) { /* Feed data to the inner SHA256 operation. */ - sph_sha256( &ctx->ictx, in, len ); + sha256_update( &ctx->ictx, in, len ); } /* Finish an HMAC-SHA256 operation. 
*/ void -HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx ) +HMAC_SHA256_Final( void *digest, HMAC_SHA256_CTX *ctx ) { - unsigned char ihash[32]; - - sph_sha256_close( &ctx->ictx, ihash ); - sph_sha256( &ctx->octx, ihash, 32 ); - sph_sha256_close( &ctx->octx, digest ); + uint32_t ihash[8] __attribute__ ((aligned (32))); + sha256_final( &ctx->ictx, ihash ); + sha256_update( &ctx->octx, ihash, 32 ); + sha256_final( &ctx->octx, digest ); } /** @@ -126,8 +124,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt, size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen ) { HMAC_SHA256_CTX PShctx, hctx; - uint8_t _ALIGN(128) T[32]; - uint8_t _ALIGN(128) U[32]; + uint64_t _ALIGN(128) T[4]; + uint64_t _ALIGN(128) U[4]; +// uint8_t _ALIGN(128) T[32]; +// uint8_t _ALIGN(128) U[32]; uint32_t ivec; size_t i, clen; uint64_t j; @@ -163,10 +163,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt, // _mm_xor_si128( ((__m128i*)T)[0], ((__m128i*)U)[0] ); // _mm_xor_si128( ((__m128i*)T)[1], ((__m128i*)U)[1] ); -// for ( k = 0; k < 4; k++ ) T[k] ^= U[k]; + for ( k = 0; k < 4; k++ ) T[k] ^= U[k]; - for ( k = 0; k < 32; k++ ) - T[k] ^= U[k]; +// for ( k = 0; k < 32; k++ ) +// T[k] ^= U[k]; } /* Copy as many bytes as necessary into buf. */ diff --git a/algo/sha/hmac-sha256-hash.h b/algo/sha/hmac-sha256-hash.h index a735c53..7a281df 100644 --- a/algo/sha/hmac-sha256-hash.h +++ b/algo/sha/hmac-sha256-hash.h @@ -31,18 +31,18 @@ #include #include -#include "sph_sha2.h" +#include "sha256-hash.h" typedef struct HMAC_SHA256Context { - sph_sha256_context ictx; - sph_sha256_context octx; + sha256_context ictx; + sha256_context octx; } HMAC_SHA256_CTX; void SHA256_Buf( const void *, size_t len, uint8_t digest[32] ); void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t ); void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t ); -void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * ); +void HMAC_SHA256_Final( void*, HMAC_SHA256_CTX * ); void HMAC_SHA256_Buf( const void *, size_t Klen, const void *, size_t len, uint8_t digest[32] ); diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index f9505d1..7b6618c 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -59,7 +59,9 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); void sha256_4way_full( void *dst, const void *data, size_t len ); -void sha256_4way_transform( __m128i *state_out, const __m128i *data, +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, const __m128i *state_in ); #endif // SSE2 @@ -79,8 +81,10 @@ void sha256_8way_init( sha256_8way_context *sc ); void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); void sha256_8way_close( sha256_8way_context *sc, void *dst ); void sha256_8way_full( void *dst, const void *data, size_t len ); -void sha256_8way_transform( __m256i *state_out, const __m256i *data, - const __m256i *state_in ); +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); #endif // AVX2 @@ -99,7 +103,9 @@ void sha256_16way_init( sha256_16way_context *sc ); void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t 
len ); void sha256_16way_close( sha256_16way_context *sc, void *dst ); void sha256_16way_full( void *dst, const void *data, size_t len ); -void sha256_16way_transform( __m512i *state_out, const __m512i *data, +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, const __m512i *state_in ); void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, const __m512i *state_in ); diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 7eb4067..2a229bf 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -180,6 +180,7 @@ static const uint32_t sha256d_hash1[16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000100 }; +// this performs the entire hash all over again, why? static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) { uint32_t S[16]; @@ -195,6 +196,7 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) hash[i] = swab32(hash[i]); } +/* #if defined (__SHA__) #include "algo/sha/sph_sha2.h" @@ -241,6 +243,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len) } #endif +*/ static inline void sha256d_preextend(uint32_t *W) { @@ -653,6 +656,7 @@ int scanhash_sha256d( struct work *work, return 0; } +/* int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -682,13 +686,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, pdata[19] = n; return 0; } - +*/ bool register_sha256d_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AVX2_OPT; gate->scanhash = (void*)&scanhash_sha256d; - gate->hash = (void*)&sha256d; +// gate->hash = (void*)&sha256d; return true; }; diff --git a/algo/sha/sha256-hash-2way-ni.c b/algo/sha/sha256-hash-2way-ni.c index f169b63..7fc64ca 100644 --- a/algo/sha/sha256-hash-2way-ni.c +++ b/algo/sha/sha256-hash-2way-ni.c @@ -7,9 +7,9 @@ #if defined(__SHA__) -#include "sha256-hash-opt.h" +#include "sha256-hash.h" -void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, +void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ) { @@ -342,4 +342,348 @@ void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); } +void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current hash + 
ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK ); + TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK ); + TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK ); + TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK ); + TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = 
_mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X 
= _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = 
_mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 + TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // 
DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + + #endif + diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index c5f6048..beac702 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -74,17 +74,6 @@ static const uint32_t K256[64] = #define CHs(X, Y, Z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) -/* -#define MAJs(X, Y, Z) \ - _mm_or_si128( _mm_and_si128( X, Y ), \ - _mm_and_si128( _mm_or_si128( X, Y ), Z ) ) -*/ -/* -#define MAJs(X, Y, Z) \ - _mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \ - _mm_xor_si128( Y, Z ) ) ) -*/ - #define MAJs(X, Y, Z) \ _mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \ Y_xor_Z ) ) @@ -105,38 +94,6 @@ static const uint32_t K256[64] = _mm_xor_si128( _mm_xor_si128( \ mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) -/* -#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ -do { \ - __m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \ - __m128i T1 = mm128_ror_32( E, 14 ); \ - __m128i T2 = mm128_ror_32( A, 9 ); \ - __m128i T3 = _mm_xor_si128( F, G ); \ - __m128i T4 = _mm_or_si128( A, B ); \ - __m128i T5 = _mm_and_si128( A, B ); \ - K = _mm_add_epi32( K, W[i] ); \ - T1 = _mm_xor_si128( T1, E ); \ - T2 = _mm_xor_si128( T2, A ); \ - T3 = _mm_and_si128( T3, E ); \ - T4 = _mm_and_si128( T4, C ); \ - K = _mm_add_epi32( H, K ); \ - T1 = mm128_ror_32( T1, 5 ); \ - T2 = mm128_ror_32( T2, 11 ); \ - T3 = _mm_xor_si128( T3, G ); \ - T4 = _mm_or_si128( T4, T5 ); \ - T1 = _mm_xor_si128( T1, E ); \ - T2 = _mm_xor_si128( T2, A ); \ - T1 = mm128_ror_32( T1, 6 ); \ - T2 = mm128_ror_32( T2, 2 ); \ - T1 = _mm_add_epi32( T1, T3 ); \ - T2 = _mm_add_epi32( T2, T4 ); \ - T1 = _mm_add_epi32( T1, K ); \ - H = _mm_add_epi32( T1, T2 ); \ - D = _mm_add_epi32( D, T1 ); \ -} while (0) -*/ - - #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m128i T1, T2; \ @@ -149,8 +106,8 @@ do { \ H = _mm_add_epi32( T1, T2 ); \ } while (0) - -void sha256_4way_transform( __m128i *state_out, const __m128i *data, +// LE data, no need to byte swap +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, const __m128i *state_in ) { __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; @@ -232,6 +189,91 @@ void sha256_4way_transform( __m128i *state_out, const __m128i *data, state_out[7] = _mm_add_epi32( state_in[7], H ); } +// BE data, need to byte swap +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + __m128i W[16]; + + mm128_block_bswap_32( W, data ); + mm128_block_bswap_32( W+8, data+8 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + Y_xor_Z = _mm_xor_si128( B, C ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_4WAY_STEP( E, F, 
G, H, A, B, C, D, 4, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm_add_epi32( state_in[0], A ); + state_out[1] = _mm_add_epi32( state_in[1], B ); + state_out[2] = _mm_add_epi32( state_in[2], C ); + state_out[3] = _mm_add_epi32( state_in[3], D ); + state_out[4] = _mm_add_epi32( state_in[4], E ); + state_out[5] = _mm_add_epi32( state_in[5], F ); + state_out[6] = _mm_add_epi32( state_in[6], G ); + state_out[7] = _mm_add_epi32( state_in[7], H ); +} + + static void sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) { @@ -436,61 +478,81 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) // SHA-256 8 way -#if defined(__AVX512VL__) - -#define CHx(X, Y, Z) \ - _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) - -#define MAJx(X, Y, Z) \ - _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 ) - #define BSG2_0x(x) \ - mm256_xor3( mm256_ror_32(x, 2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \ + mm256_ror_32( x, 13 ) ), \ + mm256_ror_32( x, 22 ) ) #define BSG2_1x(x) \ - mm256_xor3( mm256_ror_32(x, 6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \ + mm256_ror_32( x, 11 ) ), \ + mm256_ror_32( x, 25 ) ) #define SSG2_0x(x) \ - mm256_xor3( mm256_ror_32(x, 7), mm256_ror_32(x, 18), _mm256_srli_epi32(x, 3) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \ + mm256_ror_32( x, 18 ) ), \ + _mm256_srli_epi32( x, 3 ) ) #define SSG2_1x(x) \ - mm256_xor3( mm256_ror_32(x, 17), 
mm256_ror_32(x, 19), _mm256_srli_epi32(x, 10) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \ + mm256_ror_32( x, 19 ) ), \ + _mm256_srli_epi32( x, 10 ) ) + +#define SHA2x_MEXP( a, b, c, d ) \ + mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); + +// With AVX512VL ternary logic optimizations are available. +// If not optimize by forwarding the result of X^Y in MAJ to the next round +// to avoid recalculating it as Y^Z. This optimization is not applicable +// when MAJ is optimized with ternary logic. + +#if defined(__AVX512VL__) + +#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) + +#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 ) + +#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \ + W[ i ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} while (0) #else // AVX2 #define CHx(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) -#define MAJx(X, Y, Z) \ - _mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \ - _mm256_xor_si256( Y, Z ) ) ) -/* +// Use saved X_xor_Y from previous round, now called Y_xor_Z, +// and save new X_xor_Y, for next round. #define MAJx(X, Y, Z) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) -*/ -#define BSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 2), mm256_ror_32(x, 13) ), mm256_ror_32( x, 22) ) - -#define BSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 6), mm256_ror_32(x, 11) ), mm256_ror_32( x, 25) ) - -#define SSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 7), mm256_ror_32(x, 18) ), _mm256_srli_epi32(x, 3) ) - -#define SSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) ) - -#endif // AVX512 else AVX2 - -#define SHA2x_MEXP( a, b, c, d ) \ - mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); +#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \ + W[ i ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + Y_xor_Z = X_xor_Y; \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} while (0) +/* #define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m256i T1, T2; \ @@ -498,16 +560,23 @@ do { \ T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ K, W[i] ) ); \ T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) +*/ -void sha256_8way_transform( __m256i *state_out, const __m256i *data, +#endif // AVX512VL else AVX2 + +// accepts LE byte ordered data, skip the byte swap +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, const __m256i *state_in ) { __m256i A, B, C, D, E, F, G, H; +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z; +#endif __m256i W[16]; - memcpy_256( W, data, 16 ); A = 
state_in[0]; @@ -519,6 +588,101 @@ void sha256_8way_transform( __m256i *state_out, const __m256i *data, G = state_in[6]; H = state_in[7]; +#if !defined(__AVX512VL__) + Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm256_add_epi32( state_in[0], A ); + state_out[1] = _mm256_add_epi32( state_in[1], B ); + state_out[2] = _mm256_add_epi32( state_in[2], C ); + state_out[3] = _mm256_add_epi32( state_in[3], D ); + state_out[4] = _mm256_add_epi32( state_in[4], E ); + state_out[5] = _mm256_add_epi32( state_in[5], F ); + state_out[6] = _mm256_add_epi32( state_in[6], G ); + state_out[7] = _mm256_add_epi32( state_in[7], H ); +} + + +// Accepts BE byte ordered data, need to byte swap +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) +{ + __m256i A, B, C, D, E, F, G, H; +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z; +#endif + __m256i W[16]; + + mm256_block_bswap_32( W , data ); + mm256_block_bswap_32( W+8, data+8 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + +#if !defined(__AVX512VL__) + Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + SHA2s_8WAY_STEP( A, B, C, D, E, 
F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -587,6 +751,9 @@ static void sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) { register __m256i A, B, C, D, E, F, G, H; +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z; +#endif __m256i W[16]; mm256_block_bswap_32( W , in ); @@ -615,6 +782,10 @@ sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) H = m256_const1_64( 0x5BE0CD195BE0CD19 ); } +#if !defined(__AVX512VL__) + Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -790,27 +961,44 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) // SHA-256 16 way -#define CHx16(X, Y, Z) \ - _mm512_ternarylogic_epi32( X, Y, Z, 0xca ) +#define CHx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xca ) -#define MAJx16(X, Y, Z) \ - _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 ) +#define MAJx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 ) -#define BSG2_0x16(x) \ - mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) ) +#define BSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 2 ), \ + _mm512_ror_epi32( x, 13 ), \ + _mm512_ror_epi32( x, 22 ) ) -#define BSG2_1x16(x) \ - mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) ) +#define BSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 6 ), \ + _mm512_ror_epi32( x, 11 ), \ + _mm512_ror_epi32( x, 25 ) ) -#define SSG2_0x16(x) \ - mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) ) +#define SSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 7 ), \ + _mm512_ror_epi32( x, 18 ), \ + _mm512_srli_epi32( x, 3 ) ) -#define SSG2_1x16(x) \ - mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) ) +#define SSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 17 ), \ + _mm512_ror_epi32( x, 19 ), \ + _mm512_srli_epi32( x, 10 ) ) #define SHA2x16_MEXP( a, b, c, d ) \ mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] ); +#define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[ (j)+(i) ] ), \ + W[ i ] ); \ + __m512i T1 = BSG2_1x16( E ); \ + __m512i T2 = BSG2_0x16( A ); \ + T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \ + T1 = _mm512_add_epi32( T1, H ); \ + T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \ + T1 = _mm512_add_epi32( T1, T0 ); \ + D = _mm512_add_epi32( D, T1 ); \ + H = _mm512_add_epi32( T1, T2 ); \ +} while (0) + +/* #define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m512i T1, T2; \ @@ -821,14 +1009,10 @@ do { \ D = _mm512_add_epi32( D, T1 ); \ H = _mm512_add_epi32( T1, T2 ); \ } while (0) +*/ -// Tranform one 16 lane by 64 byte message block and update state. -// Calling function is responsible for initializing the state, setting -// correct byte order, counting bits and padding of the final block. -// It's faster for multiple rounds of sha256 (sha256d/t/q) by eliminating -// redundant byte swapping. 
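/* Reference sketch (illustrative only, not part of the patch): the plain scalar
 * SHA-256 compression that each lane of the 4/8/16-way transforms computes.
 * It mirrors the structure of the vector code: rounds are taken 16 at a time
 * and the message schedule lives in a 16-word ring expanded in place, which is
 * why the SHA2*_MEXP calls use index patterns like (14, 9, 1, 0).  Assumes
 * <stdint.h>/<string.h> and the K256[64] round-constant table already used by
 * the STEP macros in this file; the function and macro names are hypothetical.
 */
static inline uint32_t ror32( uint32_t x, int n )
{  return ( x >> n ) | ( x << ( 32 - n ) );  }

#define CHs(x, y, z)   ( ( ( (y) ^ (z) ) & (x) ) ^ (z) )
#define MAJs(x, y, z)  ( ( (x) & (y) ) | ( ( (x) | (y) ) & (z) ) )
#define BSG0s(x)   ( ror32( x,  2 ) ^ ror32( x, 13 ) ^ ror32( x, 22 ) )
#define BSG1s(x)   ( ror32( x,  6 ) ^ ror32( x, 11 ) ^ ror32( x, 25 ) )
#define SSG0s(x)   ( ror32( x,  7 ) ^ ror32( x, 18 ) ^ ( (x) >>  3 ) )
#define SSG1s(x)   ( ror32( x, 17 ) ^ ror32( x, 19 ) ^ ( (x) >> 10 ) )

static void sha256_compress_scalar( uint32_t state[8], const uint32_t block[16] )
{
   uint32_t W[16], S[8];            // S[0..7] = A..H
   memcpy( W, block, sizeof W );    // block words already decoded big-endian
   memcpy( S, state, sizeof S );

   for ( int j = 0; j < 64; j += 16 )
   {
      if ( j )   // expand the next 16 schedule words in place
         for ( int i = 0; i < 16; i++ )
            W[i] += SSG1s( W[ (i+14) & 15 ] ) + W[ (i+9) & 15 ]
                  + SSG0s( W[ (i+ 1) & 15 ] );

      for ( int i = 0; i < 16; i++ )
      {
         const uint32_t T1 = S[7] + BSG1s( S[4] ) + CHs( S[4], S[5], S[6] )
                           + K256[ j+i ] + W[i];
         const uint32_t T2 = BSG0s( S[0] ) + MAJs( S[0], S[1], S[2] );
         S[7] = S[6];  S[6] = S[5];  S[5] = S[4];  S[4] = S[3] + T1;
         S[3] = S[2];  S[2] = S[1];  S[1] = S[0];  S[0] = T1 + T2;
      }
   }
   for ( int i = 0; i < 8; i++ ) state[i] += S[i];   // feed-forward
}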
-// -void sha256_16way_transform( __m512i *state_out, const __m512i *data, +// accepts LE input data +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, const __m512i *state_in ) { __m512i A, B, C, D, E, F, G, H; @@ -909,6 +1093,89 @@ void sha256_16way_transform( __m512i *state_out, const __m512i *data, state_out[7] = _mm512_add_epi32( state_in[7], H ); } +// Accepts BE input data, need to bswap +void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; + + mm512_block_bswap_32( W , data ); + mm512_block_bswap_32( W+8, data+8 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm512_add_epi32( state_in[0], A ); + state_out[1] = _mm512_add_epi32( state_in[1], B ); + state_out[2] = _mm512_add_epi32( state_in[2], C ); + state_out[3] = _mm512_add_epi32( state_in[3], D ); + state_out[4] = _mm512_add_epi32( state_in[4], E ); + state_out[5] = _mm512_add_epi32( state_in[5], F ); + state_out[6] = _mm512_add_epi32( state_in[6], G ); + 
state_out[7] = _mm512_add_epi32( state_in[7], H ); +} + // Aggresive prehashing void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, const __m512i *state_in ) diff --git a/algo/sha/sha256-hash-opt.c b/algo/sha/sha256-hash-opt.c index 78bda65..e08dd60 100644 --- a/algo/sha/sha256-hash-opt.c +++ b/algo/sha/sha256-hash-opt.c @@ -7,9 +7,9 @@ #if defined(__SHA__) -#include "sha256-hash-opt.h" +#include "sha256-hash.h" -void sha256_opt_transform( uint32_t *state_out, const void *input, +void sha256_opt_transform_le( uint32_t *state_out, const void *input, const uint32_t *state_in ) { __m128i STATE0, STATE1; @@ -197,4 +197,192 @@ void sha256_opt_transform( uint32_t *state_out, const void *input, _mm_store_si128((__m128i*) &state_out[4], STATE1); } + +void sha256_opt_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP, MASK; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); + TMSG0 = _mm_shuffle_epi8( TMSG0, MASK ); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); + TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); + TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); + TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = 
_mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, 
_mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); +} + #endif diff --git a/algo/sha/sha256-hash-opt.h b/algo/sha/sha256-hash-opt.h deleted file mode 100644 index 9ceacf4..0000000 --- a/algo/sha/sha256-hash-opt.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef SHA2_HASH_OPT_H__ -#define SHA2_HASH_OPT_H__ 1 - -#include -#include "simd-utils.h" - -#if defined(__SHA__) - -void sha256_opt_transform( uint32_t *state_out, const void *input, - const uint32_t *state_in ); - -// 2 way with interleaved instructions -void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, - const void *msg_X, const void *msg_Y, - const uint32_t *in_X, const uint32_t *in_Y ); - -#endif -#endif diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c new file mode 100644 index 0000000..ddbaacc --- /dev/null +++ b/algo/sha/sha256-hash.c @@ -0,0 +1,142 @@ +#include "sha256-hash.h" + +static const uint32_t SHA256_IV[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +/* +static const uint8_t SHA256_PAD[64] = +{ + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +*/ + +void sha256_ctx_init( sha256_context *ctx ) +{ + memcpy( ctx->state, SHA256_IV, sizeof SHA256_IV ); + ctx->count = 0; +} + +void sha256_update( sha256_context *ctx, const void *data, size_t len ) +{ + int ptr = ctx->count & 0x3f; + const uint8_t *src = data; + + ctx->count += (uint64_t)len; + + if ( len < 64 - ptr ) + { + memcpy( ctx->buf + ptr, src, len ); + return; + } + + memcpy( ctx->buf + ptr, src, 64 - ptr ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + src += 64 - ptr; + len -= 64 - ptr; + + while ( len >= 64 ) + { + sha256_transform_be( ctx->state, (uint32_t*)src, ctx->state ); + src += 64; + len -= 64; + } + + memcpy( ctx->buf, src, len ); +} + +#if 0 +void sha256_final( sha256_context *ctx, uint32_t *hash ) +{ + size_t r; + + + /* Figure out how many bytes we have buffered. 
*/ + r = ctx->count & 0x3f; +// r = ( ctx->count >> 3 ) & 0x3f; + +//printf("final: count= %d, r= %d\n", ctx->count, r ); + + /* Pad to 56 mod 64, transforming if we finish a block en route. */ + if ( r < 56 ) + { + /* Pad to 56 mod 64. */ + memcpy( &ctx->buf[r], SHA256_PAD, 56 - r ); + } + else + { + /* Finish the current block and mix. */ + memcpy( &ctx->buf[r], SHA256_PAD, 64 - r ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + +// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + /* The start of the final block is all zeroes. */ + memset( &ctx->buf[0], 0, 56 ); + } + + /* Add the terminating bit-count. */ + ctx->buf[56] = bswap_64( ctx->count << 3 ); +// ctx->buf[56] = bswap_64( ctx->count ); +// be64enc( &ctx->buf[56], ctx->count ); + + /* Mix in the final block. */ + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + +// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] ); + +// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i ); + +/* +// be32enc_vect(digest, ctx->state, 4); +// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len) + // Encode vector, two words at a time. + do { + be32enc(&dst[0], src[0]); + be32enc(&dst[4], src[1]); + src += 2; + dst += 8; + } while (--len); +*/ + +} +#endif + +void sha256_final( sha256_context *ctx, void *hash ) +{ + int ptr = ctx->count & 0x3f; + + ctx->buf[ ptr++ ] = 0x80; + + if ( ptr > 56 ) + { + memset( ctx->buf + ptr, 0, 64 - ptr ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + memset( ctx->buf, 0, 56 ); + } + else + memset( ctx->buf + ptr, 0, 56 - ptr ); + + *(uint64_t*)(&ctx->buf[56]) = bswap_64( ctx->count << 3 ); + + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + + for ( int i = 0; i < 8; i++ ) + ( (uint32_t*)hash )[i] = bswap_32( ctx->state[i] ); +} + +void sha256_full( void *hash, const void *data, size_t len ) +{ + sha256_context ctx; + sha256_ctx_init( &ctx ); + sha256_update( &ctx, data, len ); + sha256_final( &ctx, hash ); +} + diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h new file mode 100644 index 0000000..c6d61d8 --- /dev/null +++ b/algo/sha/sha256-hash.h @@ -0,0 +1,56 @@ +#ifndef SHA256_HASH_H__ +#define SHA256_HASH_H__ 1 + +#include +#include "simd-utils.h" +#include "cpuminer-config.h" +#include "sph_sha2.h" + + +// generic interface + +typedef struct { + unsigned char buf[64]; /* first field, for alignment */ + uint32_t state[8]; + uint64_t count; +} sha256_context __attribute__((aligned(64))); + +void sha256_full( void *hash, const void *data, size_t len ); +void sha256_update( sha256_context *ctx, const void *data, size_t len ); +void sha256_final( sha256_context *ctx, void *hash ); +void sha256_ctx_init( sha256_context *ctx ); +void sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); +void sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + +#if defined(__SHA__) + +void sha256_opt_transform_le( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +void sha256_opt_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +// 2 way with interleaved instructions +void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +void sha256_ni2way_transform_be( uint32_t *out_X, 
uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +// Select target +// with SHA... +#define sha256_transform_le sha256_opt_transform_le +#define sha256_transform_be sha256_opt_transform_be + +#else + +// without SHA... +#define sha256_transform_le sph_sha256_transform_le +#define sha256_transform_be sph_sha256_transform_be + +#endif +#endif diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index 9bbc5c8..fd3ae2f 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -14,6 +14,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, __m512i hash32[8] __attribute__ ((aligned (32))); __m512i initstate[8] __attribute__ ((aligned (32))); __m512i midstate[8] __attribute__ ((aligned (32))); + __m512i midstate2[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); __m512i vdata[20] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); @@ -23,7 +24,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 16; uint32_t n = first_nonce; - __m512i *noncev = vdata + 19; + __m512i *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; const __m512i last_byte = m512_const1_32( 0x80000000 ); @@ -45,27 +46,30 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); - // hash first 64 bytes of data - sha256_16way_transform( midstate, vdata, initstate ); + // hash first 64 byte block of data + sha256_16way_transform_le( midstate, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); do { // 1. final 16 bytes of data, with padding memcpy_512( block, vdata + 16, 4 ); block[ 4] = last_byte; - memset_zero_512( block + 5, 10 ); + memset_zero_512( block + 5, 10 ); block[15] = m512_const1_32( 80*8 ); // bit count - sha256_16way_transform( hash32, block, midstate ); + sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); // 2. 32 byte hash from 1. memcpy_512( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_512( block + 9, 6 ); block[15] = m512_const1_32( 32*8 ); // bit count - sha256_16way_transform( hash32, block, initstate ); + sha256_16way_transform_le( hash32, block, initstate ); // byte swap final hash for testing - mm512_block_bswap_32( hash32, hash32 ); + mm512_block_bswap_32( hash32, hash32 ); for ( int lane = 0; lane < 16; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) @@ -85,7 +89,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, return 0; } - #endif #if defined(SHA256D_8WAY) @@ -128,7 +131,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_8way_transform( midstate, vdata, initstate ); + sha256_8way_transform_le( midstate, vdata, initstate ); do { @@ -137,14 +140,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_256( block + 5, 10 ); block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_transform( hash32, block, midstate ); + sha256_8way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. 
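// Explanatory note on the 3-round prehash above (not part of the patch): in
// the second 64-byte block of the 80-byte header only W[3] (vdata[19], the
// nonce lanes) changes per loop iteration; W[0..2] and the padding / length
// words stay fixed.  Because SHA-256 round i consumes only W[i], rounds 0-2
// depend solely on the midstate and those fixed words, so
// sha256_16way_prehash_3rounds computes them once into midstate2 outside the
// nonce loop, and sha256_16way_final_rounds presumably resumes from that
// partial state inside the loop, with midstate still supplied for the final
// feed-forward addition.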
memcpy_256( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_256( block + 9, 6 ); block[15] = m256_const1_32( 32*8 ); // bit count - sha256_8way_transform( hash32, block, initstate ); + sha256_8way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm256_block_bswap_32( hash32, hash32 ); @@ -209,7 +212,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_4way_transform( midstate, vdata, initstate ); + sha256_4way_transform_le( midstate, vdata, initstate ); do { @@ -218,14 +221,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_128( block + 5, 10 ); block[15] = m128_const1_32( 80*8 ); // bit count - sha256_4way_transform( hash32, block, midstate ); + sha256_4way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_128( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_128( block + 9, 6 ); block[15] = m128_const1_32( 32*8 ); // bit count - sha256_4way_transform( hash32, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm128_block_bswap_32( hash32, hash32 ); diff --git a/algo/sha/sha256d.c b/algo/sha/sha256d.c new file mode 100644 index 0000000..ed4bd60 --- /dev/null +++ b/algo/sha/sha256d.c @@ -0,0 +1,8 @@ +#include "sha256d.h" + +void sha256d( void *hash, const void *data, int len ) +{ + sha256_full( hash, data, len ); + sha256_full( hash, hash, 32 ); +} + diff --git a/algo/sha/sha256d.h b/algo/sha/sha256d.h new file mode 100644 index 0000000..71f78ee --- /dev/null +++ b/algo/sha/sha256d.h @@ -0,0 +1,7 @@ +#include "algo-gate-api.h" +#include +#include +#include "sha256-hash.h" + +void sha256d( void *hash, const void *data, int len ); + diff --git a/algo/sha/sha256q.c b/algo/sha/sha256q.c index cf9890e..90a2b7b 100644 --- a/algo/sha/sha256q.c +++ b/algo/sha/sha256q.c @@ -3,14 +3,14 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" -static __thread sph_sha256_context sha256q_ctx __attribute__ ((aligned (64))); +static __thread sha256_context sha256q_ctx __attribute__ ((aligned (64))); void sha256q_midstate( const void* input ) { - sph_sha256_init( &sha256q_ctx ); - sph_sha256( &sha256q_ctx, input, 64 ); + sha256_ctx_init( &sha256q_ctx ); + sha256_update( &sha256q_ctx, input, 64 ); } int sha256q_hash( void* output, const void* input ) @@ -19,24 +19,16 @@ int sha256q_hash( void* output, const void* input ) const int midlen = 64; // bytes const int tail = 80 - midlen; // 16 - sph_sha256_context ctx __attribute__ ((aligned (64))); + sha256_context ctx __attribute__ ((aligned (64))); memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx ); - sph_sha256( &ctx, input + midlen, tail ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, output ); + sha256_update( &ctx, input + midlen, tail ); + sha256_final( &ctx, hash ); + sha256_full( hash, hash, 32 ); + sha256_full( hash, hash, 32 ); + sha256_full( output, hash, 32 ); + return 1; } diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 0f4fb58..12cbcde 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -47,7 +47,7 @@ int 
scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 byte block of data - sha256_16way_transform( midstate, vdata, initstate ); + sha256_16way_transform_le( midstate, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); @@ -60,18 +60,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, memset_zero_512( block + 5, 10 ); block[15] = m512_const1_32( 80*8 ); // bit count sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); -// sha256_16way_transform( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_512( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_512( block + 9, 6 ); block[15] = m512_const1_32( 32*8 ); // bit count - sha256_16way_transform( hash32, block, initstate ); + sha256_16way_transform_le( hash32, block, initstate ); // 3. 32 byte hash from 2. memcpy_512( block, hash32, 8 ); - sha256_16way_transform( hash32, block, initstate ); + sha256_16way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm512_block_bswap_32( hash32, hash32 ); @@ -137,7 +136,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_8way_transform( midstate, vdata, initstate ); + sha256_8way_transform_le( midstate, vdata, initstate ); do { @@ -146,18 +145,18 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_256( block + 5, 10 ); block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_transform( hash32, block, midstate ); + sha256_8way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_256( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_256( block + 9, 6 ); block[15] = m256_const1_32( 32*8 ); // bit count - sha256_8way_transform( hash32, block, initstate ); + sha256_8way_transform_le( hash32, block, initstate ); // 3. 32 byte hash from 2. memcpy_256( block, hash32, 8 ); - sha256_8way_transform( hash32, block, initstate ); + sha256_8way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm256_block_bswap_32( hash32, hash32 ); @@ -222,7 +221,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_4way_transform( midstate, vdata, initstate ); + sha256_4way_transform_le( midstate, vdata, initstate ); do { @@ -231,18 +230,18 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_128( block + 5, 10 ); block[15] = m128_const1_32( 80*8 ); // bit count - sha256_4way_transform( hash32, block, midstate ); + sha256_4way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_128( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_128( block + 9, 6 ); block[15] = m128_const1_32( 32*8 ); // bit count - sha256_4way_transform( hash32, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); // 3. 32 byte hash from 2. 
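      // Explanatory note (not part of the patch): pass 3 only refreshes
      // block[0..7] with the 32-byte hash from pass 2.  block[8..15] still
      // hold the 0x80 terminator, the zero padding and the 32*8 bit count
      // written for pass 2, and since pass 3 also hashes a 32-byte message
      // that padding is identical and can be reused as-is.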
memcpy_128( block, hash32, 8 ); - sha256_4way_transform( hash32, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm128_block_bswap_32( hash32, hash32 ); diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index 90d2754..c528d27 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -4,120 +4,12 @@ #include #include //#include "algo/sha/sph_sha2.h" -#include "sha256-hash-opt.h" +#include "sha256-hash.h" #if defined(__SHA__) // Only used on CPUs with SHA -/* -static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64))); - -void sha256t_midstate( const void* input ) -{ - sph_sha256_init( &sha256t_ctx ); - sph_sha256( &sha256t_ctx, input, 64 ); -} - -int sha256t_hash( void* output, const void* input ) -{ - uint32_t _ALIGN(64) hash[16]; - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - sph_sha256_context ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx ); - - sph_sha256( &ctx, input + midlen, tail ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, output ); - - return 1; -} -*/ - -/* -int scanhash_sha256t( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t block[16] __attribute__ ((aligned (64))); - uint32_t hash32[8] __attribute__ ((aligned (32))); - uint32_t initstate[8] __attribute__ ((aligned (32))); - uint32_t midstate[8] __attribute__ ((aligned (32))); - - - -// uint32_t edata[20] __attribute__((aligned(64))); -// uint32_t hash[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 1; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - -// mm128_bswap32_80( edata, pdata ); -// sha256t_midstate( edata ); - - // initialize state - initstate[0] = 0x6A09E667; - initstate[1] = 0xBB67AE85; - initstate[2] = 0x3C6EF372; - initstate[3] = 0xA54FF53A; - initstate[4] = 0x510E527F; - initstate[5] = 0x9B05688C; - initstate[6] = 0x1F83D9AB; - initstate[7] = 0x5BE0CD19; - - // hash first 64 bytes of data - sha256_opt_transform( midstate, pdata, initstate ); - - do - { - // 1. final 16 bytes of data, with padding - memcpy( block, pdata + 16, 16 ); - block[ 4] = 0x80000000; - memset( block + 5, 0, 40 ); - block[15] = 80*8; // bit count - sha256_opt_transform( hash32, block, midstate ); - - // 2. 32 byte hash from 1. - memcpy( block, hash32, 32 ); - block[ 8] = 0x80000000; - memset( block + 9, 0, 24 ); - block[15] = 32*8; // bit count - sha256_opt_transform( hash32, block, initstate ); - - // 3. 32 byte hash from 2. 
- memcpy( block, hash32, 32 ); - sha256_opt_transform( hash32, block, initstate ); - - // byte swap final hash for testing - casti_m128i( hash32, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 ); - casti_m128i( hash32, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 ); - - if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) ) - submit_solution( work, hash32, mythr ); - n++; - pdata[19] = n; - } while ( (n < last_nonce) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce; - return 0; -} -*/ int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) @@ -149,7 +41,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, initstate[7] = 0x5BE0CD19; // hash first 64 bytes of data - sha256_opt_transform( midstate, pdata, initstate ); + sha256_opt_transform_le( midstate, pdata, initstate ); do { @@ -162,7 +54,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, memset( block0 + 5, 0, 40 ); memset( block1 + 5, 0, 40 ); block0[15] = block1[15] = 80*8; // bit count - sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate ); + sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate ); // 2. 32 byte hash from 1. memcpy( block0, hash0, 32 ); @@ -171,12 +63,12 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, memset( block0 + 9, 0, 24 ); memset( block1 + 9, 0, 24 ); block0[15] = block1[15] = 32*8; // bit count - sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate ); + sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); // 3. 32 byte hash from 2. memcpy( block0, hash0, 32 ); memcpy( block1, hash1, 32 ); - sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate ); + sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); // byte swap final hash for testing casti_m128i( hash0, 0 ) = diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index e41a92b..7c96d2e 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -95,32 +95,36 @@ static const uint64_t K512[80] = // SHA-512 8 way 64 bit -#define CH8W(X, Y, Z) \ - _mm512_ternarylogic_epi64( X, Y, Z, 0xca ) +#define CH8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xca ) -#define MAJ8W(X, Y, Z) \ - _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 ) +#define MAJ8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 ) -#define BSG8W_5_0(x) \ - mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) ) +#define BSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 28 ), \ + _mm512_ror_epi64( x, 34 ), \ + _mm512_ror_epi64( x, 39 ) ) -#define BSG8W_5_1(x) \ - mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) ) +#define BSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 14 ), \ + _mm512_ror_epi64( x, 18 ), \ + _mm512_ror_epi64( x, 41 ) ) -#define SSG8W_5_0(x) \ - mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) ) +#define SSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 1 ), \ + _mm512_ror_epi64( x, 8 ), \ + _mm512_srli_epi64( x, 7 ) ) -#define SSG8W_5_1(x) \ - mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) ) +#define SSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 19 ), \ + _mm512_ror_epi64( x, 61 ), \ + _mm512_srli_epi64( x, 6 ) ) -#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \ +#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \ do { \ - __m512i T1, T2; \ 
- __m512i K = _mm512_set1_epi64( K512[ i ] ); \ - T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \ - D = _mm512_add_epi64( D, T1 ); \ + __m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \ + __m512i T1 = BSG8W_5_1( E ); \ + __m512i T2 = BSG8W_5_0( A ); \ + T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \ + T1 = _mm512_add_epi64( T1, H ); \ + T2 = _mm512_add_epi64( T2, MAJ8W( A, B, C ) ); \ + T1 = _mm512_add_epi64( T1, T0 ); \ + D = _mm512_add_epi64( D, T1 ); \ H = _mm512_add_epi64( T1, T2 ); \ } while (0) @@ -267,16 +271,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) // SHA-512 4 way 64 bit - #define CH(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) -/* -#define MAJ(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) -*/ - #define MAJ(X, Y, Z) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) @@ -289,15 +286,6 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \ _mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 ) -/* -#define BSG5_0(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) ) - -#define BSG5_1(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) ) -*/ /* #define SSG5_0(x) \ _mm256_xor_si256( _mm256_xor_si256( \ @@ -325,94 +313,20 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 ) return _mm256_add_epi64( w0a, w1a ); } -/* -#define SSG512x2_0( w0, w1, i ) do \ -{ \ - __m256i X0a, X1a, X0b, X1b; \ - X0a = mm256_ror_64( W[i-15], 1 ); \ - X1a = mm256_ror_64( W[i-14], 1 ); \ - X0b = mm256_ror_64( W[i-15], 8 ); \ - X1b = mm256_ror_64( W[i-14], 8 ); \ - X0a = _mm256_xor_si256( X0a, X0b ); \ - X1a = _mm256_xor_si256( X1a, X1b ); \ - X0b = _mm256_srli_epi64( W[i-15], 7 ); \ - X1b = _mm256_srli_epi64( W[i-14], 7 ); \ - w0 = _mm256_xor_si256( X0a, X0b ); \ - w1 = _mm256_xor_si256( X1a, X1b ); \ -} while(0) - -#define SSG512x2_1( w0, w1, i ) do \ -{ \ - __m256i X0a, X1a, X0b, X1b; \ - X0a = mm256_ror_64( W[i-2],19 ); \ - X1a = mm256_ror_64( W[i-1],19 ); \ - X0b = mm256_ror_64( W[i-2],61 ); \ - X1b = mm256_ror_64( W[i-1],61 ); \ - X0a = _mm256_xor_si256( X0a, X0b ); \ - X1a = _mm256_xor_si256( X1a, X1b ); \ - X0b = _mm256_srli_epi64( W[i-2], 6 ); \ - X1b = _mm256_srli_epi64( W[i-1], 6 ); \ - w0 = _mm256_xor_si256( X0a, X0b ); \ - w1 = _mm256_xor_si256( X1a, X1b ); \ -} while(0) -*/ -/* -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ +#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \ do { \ - __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ - __m256i T1 = mm256_ror_64( E, 23 ); \ - __m256i T2 = mm256_ror_64( A, 5 ); \ - __m256i T3 = _mm256_xor_si256( F, G ); \ - __m256i T4 = _mm256_or_si256( A, B ); \ - __m256i T5 = _mm256_and_si256( A, B ); \ - K = _mm256_add_epi64( K, W[i] ); \ - T1 = _mm256_xor_si256( T1, E ); \ - T2 = _mm256_xor_si256( T2, A ); \ - T3 = _mm256_and_si256( T3, E ); \ - T4 = _mm256_and_si256( T4, C ); \ - K = _mm256_add_epi64( H, K ); \ - T1 = mm256_ror_64( T1, 4 ); \ - T2 = mm256_ror_64( T2, 6 ); \ - T3 = _mm256_xor_si256( T3, G ); \ - T4 = _mm256_or_si256( T4, T5 ); \ - T1 = _mm256_xor_si256( T1, E ); \ - T2 = _mm256_xor_si256( T2, A ); \ - T1 = mm256_ror_64( T1, 14 ); \ - T2 = mm256_ror_64( T2, 28 ); \ - T1 = 
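 * Explanatory note (not part of the patch): the nested-rotate form retained
 * above relies on ror(ror(x,a),b) == ror(x,a+b), so
 * ror( (ror( (ror(x,23) ^ x), 4 ) ^ x), 14 ) == ror(x,41) ^ ror(x,18) ^ ror(x,14),
 * i.e. it computes the same value as the commented-out three-rotate BSG5_1
 * being removed here, just expressed as a chained rotate-and-xor.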
_mm256_add_epi64( T1, T3 ); \ - T2 = _mm256_add_epi64( T2, T4 ); \ - T1 = _mm256_add_epi64( T1, K ); \ - H = _mm256_add_epi64( T1, T2 ); \ - D = _mm256_add_epi64( D, T1 ); \ -} while (0) -*/ -/* -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ -do { \ - __m256i K = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[ i ] ) ); \ - __m256i T1 = BSG5_1(E); \ - __m256i T2 = BSG5_0(A); \ - T1 = mm256_add4_64( T1, H, CH(E, F, G), K ); \ - T2 = _mm256_add_epi64( T2, MAJ(A, B, C) ); \ - D = _mm256_add_epi64( D, T1 ); \ - H = _mm256_add_epi64( T1, T2 ); \ -} while (0) -*/ - - -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ -do { \ - __m256i T1, T2; \ - __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ - T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \ + __m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \ + __m256i T1 = BSG5_1( E ); \ + __m256i T2 = BSG5_0( A ); \ + T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \ + T1 = _mm256_add_epi64( T1, H ); \ + T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \ + T1 = _mm256_add_epi64( T1, T0 ); \ Y_xor_Z = X_xor_Y; \ - D = _mm256_add_epi64( D, T1 ); \ + D = _mm256_add_epi64( D, T1 ); \ H = _mm256_add_epi64( T1, T2 ); \ } while (0) - static void sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] ) { diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index 7e39954..a89fc8d 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -71,198 +71,6 @@ static const sph_u32 H256[8] = { * of the compression function. */ -#if defined(__SHA__) - -#include "simd-utils.h" - -static void sha2_round( const uint8_t input[], uint32_t state[8] ) -{ - __m128i STATE0, STATE1; - __m128i MSG, TMP, MASK; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128((__m128i*) &state[0]); - STATE1 = _mm_load_si128((__m128i*) &state[4]); - MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - MSG = _mm_load_si128((const __m128i*) (input+0)); - TMSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 4-7 - TMSG1 = _mm_load_si128((const __m128i*) (input+16)); - TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 8-11 - TMSG2 = _mm_load_si128((const __m128i*) (input+32)); - TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 12-15 - TMSG3 = _mm_load_si128((const __m128i*) (input+48)); - TMSG3 = 
_mm_shuffle_epi8(TMSG3, MASK); - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 16-19 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 20-23 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 24-27 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 28-31 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 32-35 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 36-39 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 40-43 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 44-47 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 
0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 48-51 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 52-55 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 56-59 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 60-63 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Add values back to state - STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); - STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - - TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA - STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG - STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA - STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &state[0], STATE0); - _mm_store_si128((__m128i*) &state[4], STATE1); -} - -#else // no SHA /* static const sph_u32 K[64] = { @@ -875,8 +683,24 @@ sha2_round(const unsigned char *data, sph_u32 r[8]) #undef SHA2_IN } -#endif // SHA else +void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ) +{ +memcpy( state_out, state_in, 32 ); +#define SHA2_IN(x) (data[x]) + SHA2_ROUND_BODY( SHA2_IN, state_out ); +#undef SHA2_IN +} +void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ) +{ +memcpy( state_out, state_in, 32 ); +#define SHA2_IN(x) sph_dec32be_aligned( data+(x) ) + SHA2_ROUND_BODY( SHA2_IN, state_out ); +#undef SHA2_IN + +} /* see sph_sha2.h */ void diff --git a/algo/sha/sph_sha2.h b/algo/sha/sph_sha2.h index e3a83eb..b76c3f4 100644 --- a/algo/sha/sph_sha2.h +++ b/algo/sha/sph_sha2.h @@ -207,6 +207,13 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]); void sph_sha256_full( void *dst, const void *data, size_t len ); +// These shouldn't be called directly, use sha256-hash.h generic functions +// sha256_transform_le & sha256_transform_be instead. 
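/* Usage sketch of the generic wrappers (illustrative only, not part of the
 * patch): compute a SHA-256 midstate over the first 64-byte block of an
 * 80-byte work header, mirroring how scanhash_sha256t calls
 * sha256_opt_transform_le.  The helper name and local IV table are
 * hypothetical; sha256_transform_le resolves to the SHA-NI or sph
 * implementation depending on the build.
 */
#include "algo/sha/sha256-hash.h"

static const uint32_t IV256[8] =
{
   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

static void header_midstate( uint32_t midstate[8], const uint32_t pdata[20] )
{
   // pdata words are already in the order the LE transform expects,
   // so no byte swap is needed before hashing the first block.
   sha256_transform_le( midstate, pdata, IV256 );
}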
+void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + +void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); #if SPH_64 diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 1b77426..9c71459 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -20,8 +20,8 @@ static const uint32_t IV512[] = #define mm256_ror2x256hi_1x32( a, b ) \ - _mm256_blend_epi32( mm256_ror128_32( a ), \ - mm256_ror128_32( b ), 0x88 ) + _mm256_blend_epi32( mm256_shuflr128_32( a ), \ + mm256_shuflr128_32( b ), 0x88 ) #if defined(__VAES__) @@ -78,7 +78,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) { // round 1, 5, 9 - k00 = _mm256_xor_si256( k13, mm256_ror128_32( + k00 = _mm256_xor_si256( k13, mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ) ); if ( r == 0 ) @@ -88,7 +88,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); k01 = _mm256_xor_si256( k00, - mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) ); if ( r == 1 ) k01 = _mm256_xor_si256( k01, _mm256_set_epi32( @@ -97,25 +97,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); k02 = _mm256_xor_si256( k01, - mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); k03 = _mm256_xor_si256( k02, - mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); k10 = _mm256_xor_si256( k03, - mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); k11 = _mm256_xor_si256( k10, - mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); k12 = _mm256_xor_si256( k11, - mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); k13 = _mm256_xor_si256( k12, - mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) ); if ( r == 2 ) k13 = _mm256_xor_si256( k13, _mm256_set_epi32( @@ -151,31 +151,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 3, 7, 11 - k00 = _mm256_xor_si256( mm256_ror128_32( + k00 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror128_32( + k01 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror128_32( + k02 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror128_32( + k03 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p1 = _mm256_xor_si256( 
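// Note (explanatory, not part of the patch): mm256_ror128_32 ->
// mm256_shuflr128_32 is a pure rename, one of the "function name changes for
// clarity" in this release; the operation is unchanged and still rotates each
// 128-bit lane right by one 32-bit element (the same effect as
// _mm256_shuffle_epi32 with control 0x39), presumably renamed because it is
// implemented as a lane shuffle rather than a bit rotate.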
p1, x ); - k10 = _mm256_xor_si256( mm256_ror128_32( + k10 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror128_32( + k11 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( mm256_ror128_32( + k12 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ), k11 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror128_32( + k13 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); @@ -209,35 +209,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 13 - k00 = _mm256_xor_si256( mm256_ror128_32( + k00 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror128_32( + k01 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror128_32( + k02 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror128_32( + k03 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); - k10 = _mm256_xor_si256( mm256_ror128_32( + k10 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror128_32( + k11 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ); + k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ); k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1, ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror128_32( + k13 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index 2c93df9..0184ee8 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -12,8 +12,8 @@ static const uint32_t IV512[] = }; #define mm512_ror2x512hi_1x32( a, b ) \ - _mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \ - mm512_ror128_32( b ) ) + _mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \ + mm512_shuflr128_32( b ) ) static void c512_4way( shavite512_4way_context *ctx, const void *msg ) @@ -60,7 +60,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) { // round 1, 5, 9 - K0 = _mm512_xor_si512( K7, mm512_ror128_32( + K0 = _mm512_xor_si512( K7, mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ) ); if ( r == 0 ) @@ -69,33 +69,33 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); K1 = _mm512_xor_si512( K0, - 
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); if ( r == 1 ) - K1 = _mm512_xor_si512( K1, mm512_ror128_32( + K1 = _mm512_xor_si512( K1, mm512_shuflr128_32( _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); K2 = _mm512_xor_si512( K1, - mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); K3 = _mm512_xor_si512( K2, - mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P3 = _mm512_xor_si512( P3, X ); K4 = _mm512_xor_si512( K3, - mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); K5 = _mm512_xor_si512( K4, - mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); K6 = _mm512_xor_si512( K5, - mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); K7 = _mm512_xor_si512( K6, - mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); if ( r == 2 ) K7 = _mm512_xor_si512( K7, mm512_swap128_64( @@ -130,31 +130,31 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) // round 3, 7, 11 - K0 = _mm512_xor_si512( mm512_ror128_32( + K0 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero ); - K1 = _mm512_xor_si512( mm512_ror128_32( + K1 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( mm512_ror128_32( + K2 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( mm512_ror128_32( + K3 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P1 = _mm512_xor_si512( P1, X ); - K4 = _mm512_xor_si512( mm512_ror128_32( + K4 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero ); - K5 = _mm512_xor_si512( mm512_ror128_32( + K5 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = _mm512_xor_si512( mm512_ror128_32( + K6 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ), K5 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7 = _mm512_xor_si512( mm512_ror128_32( + K7 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); @@ -187,34 +187,34 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) // round 13 - K0 = 
_mm512_xor_si512( mm512_ror128_32( + K0 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); - K1 = _mm512_xor_si512( mm512_ror128_32( + K1 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( mm512_ror128_32( + K2 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( mm512_ror128_32( + K3 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P3 = _mm512_xor_si512( P3, X ); - K4 = _mm512_xor_si512( mm512_ror128_32( + K4 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); - K5 = _mm512_xor_si512( mm512_ror128_32( + K5 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); + K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7= _mm512_xor_si512( mm512_ror128_32( + K7= _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index a593cf5..d8f6feb 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -74,15 +74,15 @@ static const sph_u32 IV512[] = { #endif +/* #if defined(__AVX2__) // 2 way version of above // a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] } - #define mm256_ror2x256hi_1x32( a, b ) \ _mm256_blend_epi32( mm256_ror256_1x32( a ), \ mm256_rol256_3x32( b ), 0x88 ) - #endif +*/ static void c512( sph_shavite_big_context *sc, const void *msg ) @@ -135,7 +135,7 @@ c512( sph_shavite_big_context *sc, const void *msg ) for ( r = 0; r < 3; r ++ ) { // round 1, 5, 9 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); if ( r == 0 ) @@ -144,7 +144,7 @@ c512( sph_shavite_big_context *sc, const void *msg ) x = _mm_xor_si128( p0, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); if ( r == 1 ) @@ -153,31 +153,31 @@ c512( sph_shavite_big_context *sc, const void *msg ) x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = 
mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); if ( r == 2 ) @@ -222,38 +222,38 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 3, 7, 11 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p2, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p1 = _mm_xor_si128( p1, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p0, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); @@ -295,39 +295,39 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 13 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p0, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = 
mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index a12af43..5a7cdbd 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -3,7 +3,7 @@ #include #include "skein-hash-4way.h" #include "algo/sha/sha-hash-4way.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #if defined (SKEIN_8WAY) @@ -87,7 +87,6 @@ void skeinhash_4way( void *state, const void *input ) uint32_t hash1[16] __attribute__ ((aligned (64))); uint32_t hash2[16] __attribute__ ((aligned (64))); uint32_t hash3[16] __attribute__ ((aligned (64))); - sph_sha256_context ctx_sha256; #else uint32_t vhash32[16*4] __attribute__ ((aligned (64))); sha256_4way_context ctx_sha256; @@ -98,18 +97,12 @@ void skeinhash_4way( void *state, const void *input ) #if defined(__SHA__) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash0, 64 ); - sph_sha256_close( &ctx_sha256, hash0 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash1, 64 ); - sph_sha256_close( &ctx_sha256, hash1 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash2, 64 ); - sph_sha256_close( &ctx_sha256, hash2 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash3, 64 ); - sph_sha256_close( &ctx_sha256, hash3 ); + + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); + intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 ); #else diff --git a/algo/skein/skein.c b/algo/skein/skein.c index 91eb325..be9bb82 100644 --- a/algo/skein/skein.c +++ b/algo/skein/skein.c @@ -5,21 +5,18 @@ #include #include #include "sph_skein.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" void skeinhash(void *state, const void *input) { uint32_t hash[16] __attribute__ ((aligned (64))); sph_skein512_context ctx_skein; - sph_sha256_context ctx_sha256; sph_skein512_init( &ctx_skein ); sph_skein512( &ctx_skein, input, 80 ); sph_skein512_close( &ctx_skein, hash ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash, 64 ); - sph_sha256_close( &ctx_sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy(state, hash, 32); } @@ -27,8 +24,8 @@ void skeinhash(void *state, const void *input) int scanhash_skein( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t hash64[8] __attribute__ ((aligned (64))); uint32_t endiandata[20] __attribute__ ((aligned (64))); const uint32_t Htarg = ptarget[7]; @@ -36,7 +33,7 @@ int 
scanhash_skein( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; int thr_id = mythr->id; // thr_id arg is deprecated - swab32_array( endiandata, pdata, 20 ); + swab32_array( endiandata, pdata, 20 ); do { be32enc(&endiandata[19], n); diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index 0d971f2..8880b45 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -176,12 +176,6 @@ static void rotate_indexes( uint32_t *p ) */ } #endif - -static inline uint32_t rotl32( uint32_t a, size_t r ) -{ - return ( a << r ) | ( a >> (32-r) ); -} - // Vectorized and targetted version of fnv1a #if defined (__AVX2__) @@ -232,7 +226,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ { \ const uint32_t *blob_off = blob + \ - ( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \ + ( ( fnv1a( rol32( subset[i], r ), accumulator ) % mdiv ) \ * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \ UPDATE_ACCUMULATOR; \ MULXOR; \ diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index a010344..ec808f6 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -1,5 +1,5 @@ #include "algo-gate-api.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "Verthash.h" #include "tiny_sha3/sha3-4way.h" @@ -140,7 +140,7 @@ bool register_verthash_algo( algo_gate_t* gate ) uint8_t vhDataFileHash[32] = { 0 }; applog( LOG_NOTICE, "Verifying Verthash data" ); - sph_sha256_full( vhDataFileHash, verthashInfo.data, + sha256_full( vhDataFileHash, verthashInfo.data, verthashInfo.dataSize ); if ( memcmp( vhDataFileHash, verthashDatFileHash_bytes, sizeof(verthashDatFileHash_bytes) ) == 0 ) diff --git a/algo/whirlpool/whirlpool.c b/algo/whirlpool/whirlpool.c index 59fcf71..1c6b688 100644 --- a/algo/whirlpool/whirlpool.c +++ b/algo/whirlpool/whirlpool.c @@ -82,7 +82,7 @@ int scanhash_whirlpool( struct work* work, uint32_t max_nonce, be32enc(&endiandata[19], n ); whirlpool_hash(vhash, endiandata); - if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + if (vhash[7] <= Htarg && fulltest(vhash, ptarget) & ! 
opt_benchmark ) submit_solution( work, vhash, mythr ); } while ( n < max_nonce && !work_restart[thr_id].restart); diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 2973952..8d4fb05 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -52,10 +52,10 @@ void x16r_8way_prehash( void *vdata, void *pdata ) break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); - cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); - cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); - intrlv_8x64( vdata, edata, edata, edata, edata, - edata, edata, edata, edata, 640 ); + intrlv_4x128( vdata2, edata, edata, edata, edata, 640 ); + cube_4way_init( &x16r_ctx.cube, 512, 16, 32 ); + cube_4way_update( &x16r_ctx.cube, vdata2, 64 ); + rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); break; case HAMSI: mm512_bswap32_intrlv80_8x64( vdata, pdata ); @@ -207,15 +207,15 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) case LUFFA: if ( i == 0 ) { - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa_4way_update_close( &ctx.luffa, vhash, - vhash + (16<<2), 16 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa_4way_update_close( &ctx.luffa, vhash, - vhash + (16<<2), 16 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } else { @@ -230,56 +230,24 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) case CUBEHASH: if ( i == 0 ) { - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)in0 + 64, 16 ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_update_close( &ctx.cube, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*)in1 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*)in2 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*)in3 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash4, - (const byte*)in4 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash5, - (const byte*)in5 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash6, - (const byte*)in6 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash7, - (const byte*)in7 + 64, 16 ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_update_close( &ctx.cube, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } else { - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) 
hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash4, - (const byte*)in4, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash5, - (const byte*)in5, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash6, - (const byte*)in6, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash7, - (const byte*)in7, size ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } break; case SHAVITE: @@ -556,9 +524,10 @@ void x16r_4way_prehash( void *vdata, void *pdata ) break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); - cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); - cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); - intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + intrlv_2x128( vdata2, edata, edata, 640 ); + cube_2way_init( &x16r_ctx.cube, 512, 16, 32 ); + cube_2way_update( &x16r_ctx.cube, vdata2, 64 ); + rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 ); break; case HAMSI: mm256_bswap32_intrlv80_4x64( vdata, pdata ); @@ -680,13 +649,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) case LUFFA: if ( i == 0 ) { - intrlv_2x128( vhash, hash0, hash1, 640 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); - dintrlv_2x128_512( hash0, hash1, vhash ); - intrlv_2x128( vhash, hash2, hash3, 640 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); - dintrlv_2x128_512( hash2, hash3, vhash ); + intrlv_2x128( vhash, hash0, hash1, 640 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, hash2, hash3, 640 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash2, hash3, vhash ); } else { @@ -701,32 +670,24 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) case CUBEHASH: if ( i == 0 ) { - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)in0 + 64, 16 ); + intrlv_2x128( vhash, in0, in1, size<<3 ); + cube_2way_update_close( &ctx.cube, vhash, + vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash0, hash1, vhash ); memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3 + 64, 16 ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + cube_2way_update_close( &ctx.cube, vhash, + vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash2, hash3, vhash ); } else { - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 
512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + intrlv_2x128( vhash, in0, in1, size<<3 ); + cube_2way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + cube_2way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); } break; case SHAVITE: diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 09315f6..3a94344 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -1,4 +1,5 @@ #include "x16r-gate.h" +#include "algo/sha/sha256d.h" __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 }; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 748b7fa..76ca5e7 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -37,6 +37,7 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -115,7 +116,7 @@ union _x16r_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cubehashParam cube; + cube_4way_context cube; simd_4way_context simd; hamsi512_8way_context hamsi; hashState_fugue fugue; @@ -164,8 +165,8 @@ union _x16r_4way_context_overlay jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; + cube_2way_context cube; hashState_luffa luffa1; - cubehashParam cube; simd_2way_context simd; hamsi512_4way_context hamsi; hashState_fugue fugue; diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index de2dbe6..2f27116 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -13,7 +13,7 @@ #include "algo/gost/sph_gost.h" #include "algo/lyra2/lyra2.h" #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif #if defined (X21S_8WAY) @@ -208,9 +208,7 @@ union _x21s_4way_context_overlay haval256_5_4way_context haval; sph_tiger_context tiger; sph_gost512_context gost; -#if defined(__SHA__) - sph_sha256_context sha256; -#else +#if !defined(__SHA__) sha256_4way_context sha256; #endif } __attribute__ ((aligned (64))); @@ -275,18 +273,10 @@ int x21s_4way_hash( void* output, const void* input, int thrid ) #if defined(__SHA__) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); + sha256_full( output, hash0, 64 ); + sha256_full( output+32, hash1, 64 ); + sha256_full( output+64, hash2, 64 ); + sha256_full( output+96, hash3, 64 ); #else diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c index b81c07e..96782e2 100644 --- a/algo/x16/x21s.c +++ b/algo/x16/x21s.c @@ -8,7 +8,7 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" @@ -23,7 +23,7 @@ union _x21s_context_overlay sph_haval256_5_context haval; 
sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; + sha256_context sha256; }; typedef union _x21s_context_overlay x21s_context_overlay; @@ -50,9 +50,7 @@ int x21s_hash( void* output, const void* input, int thrid ) sph_gost512 ( &ctx.gost, (const void*) hash, 64 ); sph_gost512_close( &ctx.gost, (void*) hash ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash, 64 ); - sph_sha256_close( &ctx.sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy( output, hash, 32 ); diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index fcff0b6..1902a2d 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -37,7 +37,8 @@ union _x17_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; +// cube_4way_context cube; + cube_4way_2buf_context cube; #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context shavite; @@ -119,8 +120,10 @@ int x17_8way_hash( void *state, const void *input, int thr_id ) luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 ); luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); - cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 ); + +// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); +// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); #if defined(__VAES__) diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index 94b34cc..5acf3de 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -28,7 +28,7 @@ #include "algo/echo/echo-hash-4way.h" #endif #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif #if defined(X22I_8WAY) @@ -51,9 +51,7 @@ union _x22i_8way_ctx_overlay haval256_5_8way_context haval; sph_tiger_context tiger; sph_gost512_context gost; -#if defined(X22I_8WAY_SHA) - sph_sha256_context sha256; -#else +#if !defined(X22I_8WAY_SHA) sha256_8way_context sha256; #endif #if defined(__VAES__) @@ -391,30 +389,14 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) #if defined(X22I_8WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash4, 64 ); - sph_sha256_close( &ctx.sha256, output+128 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash5, 64 ); - sph_sha256_close( &ctx.sha256, output+160 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash6, 64 ); - sph_sha256_close( &ctx.sha256, output+192 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash7, 64 ); - sph_sha256_close( &ctx.sha256, output+224 ); + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); + sha256_full( hash4, hash4, 64 ); + sha256_full( hash5, hash5, 64 ); + sha256_full( hash6, hash6, 64 ); + sha256_full( hash7, hash7, 64 ); #else @@ -551,9 +533,7 @@ union _x22i_4way_ctx_overlay haval256_5_4way_context haval; sph_tiger_context tiger; 
sph_gost512_context gost; -#if defined(X22I_4WAY_SHA) - sph_sha256_context sha256; -#else +#if !defined(X22I_4WAY_SHA) sha256_4way_context sha256; #endif }; @@ -757,18 +737,10 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) #if defined(X22I_4WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); #else diff --git a/algo/x22/x22i.c b/algo/x22/x22i.c index 759e44c..d63ddf2 100644 --- a/algo/x22/x22i.c +++ b/algo/x22/x22i.c @@ -24,6 +24,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -57,7 +58,6 @@ union _x22i_context_overlay sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; }; typedef union _x22i_context_overlay x22i_context_overlay; @@ -172,9 +172,7 @@ int x22i_hash( void *output, const void *input, int thrid ) sph_gost512 (&ctx.gost, (const void*) hash, 64); sph_gost512_close(&ctx.gost, (void*) hash); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash, 64 ); - sph_sha256_close( &ctx.sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy(output, hash, 32); diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index 86f5699..ff2888e 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -33,7 +33,7 @@ #include "algo/echo/echo-hash-4way.h" #endif #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif void x25x_shuffle( void *hash ) @@ -84,7 +84,7 @@ union _x25x_8way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; #if defined(X25X_8WAY_SHA) - sph_sha256_context sha256; + sha256_context sha256; #else sha256_8way_context sha256; #endif @@ -447,31 +447,15 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) #if defined(X25X_8WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0[20], 64 ); - sph_sha256_close( &ctx.sha256, hash0[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1[20], 64 ); - sph_sha256_close( &ctx.sha256, hash1[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2[20], 64 ); - sph_sha256_close( &ctx.sha256, hash2[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3[20], 64 ); - sph_sha256_close( &ctx.sha256, hash3[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash4[20], 64 ); - sph_sha256_close( &ctx.sha256, hash4[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash5[20], 64 ); - sph_sha256_close( &ctx.sha256, hash5[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash6[20], 64 ); - sph_sha256_close( &ctx.sha256, hash6[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash7[20], 64 ); - sph_sha256_close( &ctx.sha256, hash7[21] ); - + sha256_full( hash0[21], hash0[20], 64 ); + 
sha256_full( hash1[21], hash1[20], 64 ); + sha256_full( hash2[21], hash2[20], 64 ); + sha256_full( hash3[21], hash3[20], 64 ); + sha256_full( hash4[21], hash4[20], 64 ); + sha256_full( hash5[21], hash5[20], 64 ); + sha256_full( hash6[21], hash6[20], 64 ); + sha256_full( hash7[21], hash7[20], 64 ); + intrlv_8x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21], hash4[21], hash5[21], hash6[21], hash7[21] ); @@ -646,7 +630,7 @@ union _x25x_4way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; #if defined(X25X_4WAY_SHA) - sph_sha256_context sha256; + sha256_context sha256; #else sha256_4way_context sha256; #endif @@ -848,18 +832,10 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) #if defined(X25X_4WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0[20], 64 ); - sph_sha256_close( &ctx.sha256, hash0[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1[20], 64 ); - sph_sha256_close( &ctx.sha256, hash1[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2[20], 64 ); - sph_sha256_close( &ctx.sha256, hash2[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3[20], 64 ); - sph_sha256_close( &ctx.sha256, hash3[21] ); + sha256_full( hash0[21], hash0[20], 64 ); + sha256_full( hash1[21], hash1[20], 64 ); + sha256_full( hash2[21], hash2[20], 64 ); + sha256_full( hash3[21], hash3[20], 64 ); intrlv_4x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21] ); diff --git a/algo/x22/x25x.c b/algo/x22/x25x.c index 42e7eda..aade6e2 100644 --- a/algo/x22/x25x.c +++ b/algo/x22/x25x.c @@ -23,7 +23,7 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -60,7 +60,7 @@ union _x25x_context_overlay sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; + sha256_context sha256; sph_panama_context panama; blake2s_state blake2s; }; @@ -174,9 +174,7 @@ int x25x_hash( void *output, const void *input, int thrid ) sph_gost512 (&ctx.gost, (const void*) &hash[19], 64); sph_gost512_close(&ctx.gost, (void*) &hash[20]); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, &hash[20], 64 ); - sph_sha256_close( &ctx.sha256, &hash[21] ); + sha256_full( &hash[21], &hash[20], 64 ); sph_panama_init(&ctx.panama); sph_panama (&ctx.panama, (const void*) &hash[21], 64 ); diff --git a/algo/yespower/crypto/blake2b-yp.c b/algo/yespower/crypto/blake2b-yp.c index 407d2dd..dc6eee6 100644 --- a/algo/yespower/crypto/blake2b-yp.c +++ b/algo/yespower/crypto/blake2b-yp.c @@ -35,9 +35,11 @@ #include "blake2b-yp.h" // Cyclic right rotation. -#ifndef ROTR64 -#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) -#endif +//#ifndef ROTR64 +//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) +//#endif + +#define ROTR64(x, y) ror64( x, y ) // Little-endian byte access. 
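Note on the renames running through the SHAvite hunks above (mm256_ror128_32 -> mm256_shuflr128_32, mm128_ror_1x32 -> mm128_shuflr_32) and the ROTR64 -> ror64 change in blake2b-yp.c: they separate element shuffles from true bit rotations. A minimal sketch of the two kinds of helper, assuming the usual pshufd / shift-or definitions; the real macros live in simd-utils and may differ in detail:

   #include <stdint.h>
   #include <immintrin.h>

   // Bit rotation: rotate a 64-bit word right by c bits (0 < c < 64).
   static inline uint64_t ror64( uint64_t x, unsigned c )
   {
      return ( x >> c ) | ( x << ( 64 - c ) );
   }

   // Element "rotation": move the four 32-bit lanes of a 128-bit vector one
   // position toward lower indexes; no bits inside any lane change.
   static inline __m128i mm128_shuflr_32( __m128i v )
   {
      return _mm_shuffle_epi32( v, 0x39 );   // result = { v[1], v[2], v[3], v[0] }
   }

   #if defined(__AVX2__)
   // The same element rotation applied independently to each 128-bit lane
   // of a 256-bit vector, as used in the 2-way SHAvite code.
   static inline __m256i mm256_shuflr128_32( __m256i v )
   {
      return _mm256_shuffle_epi32( v, 0x39 );
   }
   #endif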
#define B2B_GET64(p) \ diff --git a/algo/yespower/yescrypt-r8g.c b/algo/yespower/yescrypt-r8g.c index 27d1fd8..b278c36 100644 --- a/algo/yespower/yescrypt-r8g.c +++ b/algo/yespower/yescrypt-r8g.c @@ -52,8 +52,8 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce, endiandata[19] = n; // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); + sha256_ctx_init( &sha256_prehash_ctx ); + sha256_update( &sha256_prehash_ctx, endiandata, 64 ); do { yespower_tls( (unsigned char *)endiandata, params.perslen, diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index 8c9a944..8968037 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -27,14 +27,11 @@ * coin. */ #include "yespower.h" - #include "algo-gate-api.h" yespower_params_t yespower_params; -//SHA256_CTX sha256_prehash_ctx; -__thread sph_sha256_context sha256_prehash_ctx; -//__thread SHA256_CTX sha256_prehash_ctx; +__thread sha256_context sha256_prehash_ctx; // YESPOWER @@ -61,8 +58,8 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce, endiandata[19] = n; // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); + sha256_ctx_init( &sha256_prehash_ctx ); + sha256_update( &sha256_prehash_ctx, endiandata, 64 ); do { if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) ) @@ -101,10 +98,6 @@ int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce, be32enc( &endiandata[k], pdata[k] ); endiandata[19] = n; - // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); - do { if (yespower_b2b_hash( (char*) endiandata, (char*) vhash, 80, thr_id ) ) if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark ) diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index fd16c24..5e725af 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -203,17 +203,17 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, ARX(X0, X3, X2, 18) \ /* Rearrange data */ \ X1 = _mm_shuffle_epi32(X1, 0x93); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x39); \ /* Operate on "rows" */ \ ARX(X3, X0, X1, 7) \ ARX(X2, X3, X0, 9) \ ARX(X1, X2, X3, 13) \ ARX(X0, X1, X2, 18) \ /* Rearrange data */ \ + X3 = _mm_shuffle_epi32(X3, 0x93); \ X1 = _mm_shuffle_epi32(X1, 0x39); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); /** * Apply the Salsa20 core to the block provided in (X0 ... X3). 
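All of the sph_sha256 init/update/close triples removed in the skein, verthash, x21s, x22i and x25x hunks above collapse into a single sha256_full() call, while the yespower prehash keeps a streaming context. A rough sketch of the interface this implies from sha256-hash.h; the context layout and exact prototypes are assumptions, only the call pattern is taken from the hunks:

   #include <stddef.h>
   #include <stdint.h>

   typedef struct
   {
      uint8_t  buf[64];      // partial block
      uint32_t state[8];     // chaining value
      uint64_t count;        // total bytes processed
   } sha256_context;

   void sha256_ctx_init( sha256_context *ctx );
   void sha256_update( sha256_context *ctx, const void *data, size_t len );
   void sha256_final( sha256_context *ctx, void *hash );

   // One-shot convenience: equivalent to init + update + final, replacing the
   // old sph_sha256_init / sph_sha256 / sph_sha256_close sequence.
   static inline void sha256_full( void *hash, const void *data, size_t len )
   {
      sha256_context ctx;
      sha256_ctx_init( &ctx );
      sha256_update( &ctx, data, len );
      sha256_final( &ctx, hash );
   }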
@@ -1095,7 +1095,7 @@ int yespower(yespower_local_t *local, salsa20_blk_t *V, *XY; pwxform_ctx_t ctx; uint8_t sha256[32]; - sph_sha256_context sha256_ctx; + sha256_context sha256_ctx; /* Sanity-check parameters */ if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0) @@ -1138,10 +1138,9 @@ int yespower(yespower_local_t *local, // copy prehash, do tail memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx ); - - sph_sha256( &sha256_ctx, src+64, srclen-64 ); - sph_sha256_close( &sha256_ctx, sha256 ); - + sha256_update( &sha256_ctx, src+64, srclen-64 ); + sha256_final( &sha256_ctx, sha256 ); + if ( version == YESPOWER_0_5 ) { PBKDF2_SHA256( sha256, sizeof(sha256), src, srclen, 1, B, B_size ); @@ -1186,7 +1185,9 @@ int yespower(yespower_local_t *local, if ( work_restart[thrid].restart ) return 0; smix_1_0( B, r, N, V, XY, &ctx ); - + + if ( work_restart[thrid].restart ) return 0; + HMAC_SHA256_Buf( B + B_size - 64, 64, sha256, sizeof(sha256), (uint8_t *)dst ); } diff --git a/algo/yespower/yespower.h b/algo/yespower/yespower.h index 260322a..aa19004 100644 --- a/algo/yespower/yespower.h +++ b/algo/yespower/yespower.h @@ -34,7 +34,7 @@ #include /* for size_t */ #include "miner.h" #include "simd-utils.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #ifdef __cplusplus extern "C" { @@ -78,9 +78,7 @@ typedef struct { extern yespower_params_t yespower_params; -//SHA256_CTX sha256_prehash_ctx; -extern __thread sph_sha256_context sha256_prehash_ctx; -//extern __thread SHA256_CTX sha256_prehash_ctx; +extern __thread sha256_context sha256_prehash_ctx; /** * yespower_init_local(local): diff --git a/build-allarch.sh b/build-allarch.sh index c4d9ffd..5fa38f6 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,7 +4,7 @@ # during develpment. However the information contained may provide compilation # tips to users. -rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null # Icelake AVX512 SHA VAES make distclean || echo clean diff --git a/configure b/configure index 7430186..db3efc9 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.17.1' -PACKAGE_STRING='cpuminer-opt 3.17.1' +PACKAGE_VERSION='3.18.0' +PACKAGE_STRING='cpuminer-opt 3.18.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
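The yespower hunks above reuse a per-thread SHA-256 midstate: scanhash absorbs the first 64 header bytes once, yespower() copies that context for every nonce and hashes only the 16-byte tail, and the added work_restart check abandons a stale job between smix passes instead of finishing it. A hedged sketch of the per-nonce part, built on the streaming API sketched above; names follow the hunks, the wrapper itself is illustrative:

   #include <stdint.h>
   #include <string.h>

   extern __thread sha256_context sha256_prehash_ctx;  // first 64 bytes absorbed

   // Finish the SHA-256 of an 80-byte header whose constant prefix is already
   // in sha256_prehash_ctx; only the nonce-bearing tail is hashed per attempt.
   static void sha256_from_prehash( uint8_t hash[32], const uint8_t *src,
                                    size_t srclen )
   {
      sha256_context ctx;
      memcpy( &ctx, &sha256_prehash_ctx, sizeof ctx );  // resume the midstate
      sha256_update( &ctx, src + 64, srclen - 64 );     // 16 bytes for an 80 byte header
      sha256_final( &ctx, hash );
   }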
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.17.1 +cpuminer-opt configure 3.18.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.17.1, which was +It was created by cpuminer-opt $as_me 3.18.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.17.1' + VERSION='3.18.0' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.17.1, which was +This file was extended by cpuminer-opt $as_me 3.18.0, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.17.1 +cpuminer-opt config.status 3.18.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 332d1e6..fbe5a9b 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.17.1]) +AC_INIT([cpuminer-opt], [3.18.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index e9c01fe..c889538 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -38,6 +38,7 @@ #include #include #include "sysinfos.c" +#include "algo/sha/sha256d.h" #ifdef WIN32 #include @@ -94,6 +95,7 @@ bool have_gbt = true; bool allow_getwork = true; bool want_stratum = true; // pretty useless bool have_stratum = false; +bool stratum_down = true; bool allow_mininginfo = true; bool use_syslog = false; bool use_colors = true; @@ -166,6 +168,8 @@ uint32_t stale_share_count = 0; uint32_t solved_block_count = 0; double *thr_hashrates; double global_hashrate = 0.; +double total_hashes = 0.; +struct timeval total_hashes_time = {0,0}; double stratum_diff = 0.; double net_diff = 0.; double net_hashrate = 0.; @@ -1001,6 +1005,7 @@ struct share_stats_t double share_diff; double stratum_diff; double target_diff; + uint32_t height; char job_id[32]; }; @@ -1080,13 +1085,14 @@ void report_summary_log( bool force ) pthread_mutex_unlock( &stats_lock ); timeval_subtract( &et, &now, &start_time ); - timeval_subtract( &uptime, &now, &session_start ); + timeval_subtract( &uptime, &total_hashes_time, &session_start ); double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6; - double ghrate = global_hashrate; + double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. ); double target_diff = exp32 * last_targetdiff; double shrate = safe_div( target_diff * (double)(accepts), share_time, 0. ); +// global_hashrate = ghrate; double sess_hrate = safe_div( exp32 * norm_diff_sum, (double)uptime.tv_sec, 0. 
); double submit_rate = safe_div( (double)submits * 60., share_time, 0. ); @@ -1134,29 +1140,38 @@ void report_summary_log( bool force ) 100. * safe_div( (double)accepted_share_count, (double)submitted_share_count, 0. ) ); if ( stale_share_count ) - applog2( LOG_INFO, "Stale %7d %7d %5.1f%%", + { + int prio = stales ? LOG_MINR : LOG_INFO; + applog2( prio, "Stale %7d %7d %5.1f%%", stales, stale_share_count, 100. * safe_div( (double)stale_share_count, (double)submitted_share_count, 0. ) ); + } if ( rejected_share_count ) - applog2( LOG_INFO, "Rejected %7d %7d %5.1f%%", + { + int prio = rejects ? LOG_ERR : LOG_INFO; + applog2( prio, "Rejected %7d %7d %5.1f%%", rejects, rejected_share_count, 100. * safe_div( (double)rejected_share_count, (double)submitted_share_count, 0. ) ); + } if ( solved_block_count ) - applog2( LOG_INFO,"Blocks Solved %7d %7d", + { + int prio = solved ? LOG_PINK : LOG_INFO; + applog2( prio, "Blocks Solved %7d %7d", solved, solved_block_count ); + } applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g", - highest_share, lowest_share ); + highest_share, lowest_share ); int mismatch = submitted_share_count - ( accepted_share_count + stale_share_count + rejected_share_count ); if ( mismatch ) { if ( mismatch != 1 ) - applog(LOG_WARNING,"Share count mismatch: %d, stats may be inaccurate", mismatch ); + applog2(LOG_MINR, "Count mismatch: %d, stats may be inaccurate", mismatch ); else - applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" ); + applog2(LOG_INFO, CL_LBL "Count mismatch, submitted share may still be pending" CL_N ); } } @@ -1278,17 +1293,17 @@ static int share_result( int result, struct work *work, if ( use_colors ) { - bcol = acol = scol = rcol = CL_WHT; + bcol = acol = scol = rcol = CL_N; if ( likely( result ) ) { - acol = CL_WHT CL_GRN; - if ( unlikely( solved ) ) bcol = CL_WHT CL_MAG; + acol = CL_LGR; + if ( unlikely( solved ) ) bcol = CL_LMA; } - else if ( stale ) scol = CL_WHT CL_YL2; - else rcol = CL_WHT CL_RED; + else if ( stale ) scol = CL_YL2; + else rcol = CL_LRD; } - applog( LOG_NOTICE, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)", + applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)", my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol, bres, share_time, latency ); @@ -1296,8 +1311,7 @@ static int share_result( int result, struct work *work, { if ( have_stratum ) applog2( LOG_INFO, "Diff %.5g, Block %d, Job %s", - my_stats.share_diff, stratum.block_height, - my_stats.job_id ); + my_stats.share_diff, my_stats.height, my_stats.job_id ); else applog2( LOG_INFO, "Diff %.5g, Block %d", my_stats.share_diff, work ? 
work->height : last_block_height ); @@ -1308,7 +1322,7 @@ static int share_result( int result, struct work *work, uint32_t str[8]; uint32_t *targ; - if ( reason ) applog( LOG_WARNING, "Reject reason: %s", reason ); + if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason ); diff_to_hash( str, my_stats.share_diff ); applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6], @@ -1861,6 +1875,7 @@ static void update_submit_stats( struct work *work, const void *hash ) share_stats[ s_put_ptr ].net_diff = net_diff; share_stats[ s_put_ptr ].stratum_diff = stratum_diff; share_stats[ s_put_ptr ].target_diff = work->targetdiff; + share_stats[ s_put_ptr ].height = work->height; if ( have_stratum ) strncpy( share_stats[ s_put_ptr ].job_id, work->job_id, 30 ); s_put_ptr = stats_ptr_incr( s_put_ptr ); @@ -1871,6 +1886,10 @@ static void update_submit_stats( struct work *work, const void *hash ) bool submit_solution( struct work *work, const void *hash, struct thr_info *thr ) { + // Job went stale during hashing of a valid share. + if ( !opt_quiet && work_restart[ thr->id ].restart ) + applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N ); + work->sharediff = hash_to_diff( hash ); if ( likely( submit_work( thr, work ) ) ) { @@ -1887,11 +1906,11 @@ bool submit_solution( struct work *work, const void *hash, if ( !opt_quiet ) { if ( have_stratum ) - applog( LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Job %s", + applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s", submitted_share_count, work->sharediff, work->height, work->job_id ); else - applog( LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Ntime %08x", + applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x", submitted_share_count, work->sharediff, work->height, work->data[ algo_gate.ntime_index ] ); } @@ -2048,7 +2067,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_rwlock_wrlock( &g_work_lock ); pthread_mutex_lock( &sctx->work_lock ); - new_job = sctx->new_job; + new_job = sctx->new_job; // otherwise just increment extranonce2 sctx->new_job = false; free( g_work->job_id ); @@ -2084,6 +2103,14 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_mutex_unlock( &stats_lock ); + if ( !opt_quiet ) + { + int mismatch = submitted_share_count + - ( accepted_share_count + stale_share_count + rejected_share_count ); + if ( mismatch ) + applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count ); + } + if ( stratum_diff != sctx->job.diff ) applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s", sctx->job.diff, sctx->block_height, g_work->job_id ); @@ -2264,19 +2291,29 @@ static void *miner_thread( void *userdata ) } // wait for stratum to send first job - if ( have_stratum ) while ( unlikely( !g_work.job_id ) ) sleep(1); + if ( have_stratum ) while ( unlikely( stratum_down ) ) + { + if ( opt_debug ) + applog( LOG_INFO, "Thread %d waiting for first job", thr_id ); + sleep(1); + } + // nominal startng values + int64_t max64 = 20; + thr_hashrates[thr_id] = 20; while (1) { uint64_t hashes_done; struct timeval tv_start, tv_end, diff; - int64_t max64 = 1000; +// int64_t max64 = 1000; int nonce_found = 0; if ( likely( algo_gate.do_this_thread( thr_id ) ) ) { - if ( have_stratum ) + if ( have_stratum ) { + while ( unlikely( stratum_down ) ) + sleep( 1 ); if ( *nonceptr >= end_nonce ) stratum_gen_work( &stratum, &g_work ); } @@ -2383,6 +2420,8 @@ static void *miner_thread( void *userdata ) if ( 
diff.tv_usec || diff.tv_sec ) { pthread_mutex_lock( &stats_lock ); + total_hashes += hashes_done; + total_hashes_time = tv_end; thr_hashrates[thr_id] = hashes_done / ( diff.tv_sec + diff.tv_usec * 1e-6 ); pthread_mutex_unlock( &stats_lock ); @@ -2439,7 +2478,6 @@ static void *miner_thread( void *userdata ) && thr_id == opt_n_threads - 1 ) ) { double hashrate = 0.; - pthread_mutex_lock( &stats_lock ); for ( i = 0; i < opt_n_threads; i++ ) hashrate += thr_hashrates[i]; @@ -2448,8 +2486,12 @@ static void *miner_thread( void *userdata ) if ( opt_benchmark ) { + struct timeval uptime; char hr[16]; char hr_units[2] = {0,0}; + timeval_subtract( &uptime, &total_hashes_time, &session_start ); + double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. ); + scale_hash_for_display( &hashrate, hr_units ); sprintf( hr, "%.2f", hashrate ); #if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32)) @@ -2745,6 +2787,7 @@ static void *stratum_thread(void *userdata ) if ( unlikely( stratum_need_reset ) ) { stratum_need_reset = false; + stratum_down = true; stratum_disconnect( &stratum ); if ( strcmp( stratum.url, rpc_url ) ) { @@ -2755,11 +2798,13 @@ static void *stratum_thread(void *userdata ) else applog(LOG_WARNING, "Stratum connection reset"); // reset stats queue as well + restart_threads(); if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0; } while ( !stratum.curl ) { + stratum_down = true; pthread_rwlock_wrlock( &g_work_lock ); g_work_time = 0; pthread_rwlock_unlock( &g_work_lock ); @@ -2780,6 +2825,7 @@ static void *stratum_thread(void *userdata ) } else { + stratum_down = false; restart_threads(); applog(LOG_BLUE,"Stratum connection established" ); } @@ -2801,7 +2847,7 @@ static void *stratum_thread(void *userdata ) } else { - applog(LOG_WARNING, "Stratum connection interrupted"); +// applog(LOG_WARNING, "Stratum connection interrupted"); // stratum_disconnect( &stratum ); stratum_need_reset = true; } @@ -3629,6 +3675,10 @@ int main(int argc, char *argv[]) show_usage_and_exit(1); } + // need to register to get algo optimizations for cpu capabilities + // but that causes register logs before cpu capabilities is output. + // Would need to split register into 2 parts. First part sets algo + // optimizations but no logging, second part does any logging. 
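The hashrate changes above replace the instantaneous estimate with a session mean: each miner thread adds its hashes_done to a shared total under stats_lock and timestamps the update, and the summary and benchmark logs divide by time elapsed since session_start. A minimal sketch of that bookkeeping, assuming the globals named in the hunks; record_hashes() and session_hashrate() are illustrative helpers, not functions from the patch:

   #include <pthread.h>
   #include <sys/time.h>

   extern double          total_hashes;       // hashes completed by all threads
   extern struct timeval  total_hashes_time;  // time of the most recent update
   extern struct timeval  session_start;
   extern pthread_mutex_t stats_lock;

   // Called by a miner thread after each scanhash pass.
   static void record_hashes( double hashes_done, const struct timeval *now )
   {
      pthread_mutex_lock( &stats_lock );
      total_hashes += hashes_done;
      total_hashes_time = *now;
      pthread_mutex_unlock( &stats_lock );
   }

   // Session mean hashrate used for the summary reference rate and the
   // benchmark total.
   static double session_hashrate( void )
   {
      double secs = (double)( total_hashes_time.tv_sec - session_start.tv_sec );
      return secs > 0. ? total_hashes / secs : 0.;
   }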
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); if ( !check_cpu_capability() ) exit(1); @@ -3685,12 +3735,6 @@ int main(int argc, char *argv[]) } } - // Initialize stats times and counters - memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) ); - gettimeofday( &last_submit_time, NULL ); - memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); - memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); - // if ( !check_cpu_capability() ) exit(1); pthread_mutex_init( &stats_lock, NULL ); @@ -3854,7 +3898,8 @@ int main(int argc, char *argv[]) return 1; } } - if ( have_stratum ) + + if ( have_stratum ) { if ( opt_debug ) applog(LOG_INFO,"Creating stratum thread"); @@ -3900,24 +3945,35 @@ int main(int argc, char *argv[]) opt_api_listen ); } + // hold the stats lock while starting miner threads + pthread_mutex_lock( &stats_lock ); + /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) + for ( i = 0; i < opt_n_threads; i++ ) { - usleep( 5000 ); +// usleep( 5000 ); thr = &thr_info[i]; thr->id = i; thr->q = tq_new(); - if (!thr->q) + if ( !thr->q ) return 1; - err = thread_create(thr, miner_thread); - if (err) { - applog(LOG_ERR, "Miner thread %d create failed", i); + err = thread_create( thr, miner_thread ); + if ( err ) + { + applog( LOG_ERR, "Miner thread %d create failed", i ); return 1; } } - applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", - opt_n_threads, num_cpus, algo_names[opt_algo] ); + // Initialize stats times and counters + memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) ); + gettimeofday( &last_submit_time, NULL ); + memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); + memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); + pthread_mutex_unlock( &stats_lock ); + + applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", + opt_n_threads, num_cpus, algo_names[opt_algo] ); /* main loop - simply wait for workio thread to exit */ pthread_join( thr_info[work_thr_id].pth, NULL ); diff --git a/miner.h b/miner.h index 9ca56b8..5592d4a 100644 --- a/miner.h +++ b/miner.h @@ -70,17 +70,25 @@ void *alloca (size_t); #ifdef HAVE_SYSLOG_H #include -#define LOG_BLUE 0x10 /* unique value */ +#define LOG_BLUE 0x10 /* unique value */ +#define LOG_MAJR 0x11 /* unique value */ +#define LOG_MINR 0x12 /* unique value */ +#define LOG_GREEN 0x13 /* unique value */ +#define LOG_PINK 0x14 /* unique value */ #else enum { - LOG_ERR, + LOG_CRIT, + LOG_ERR, LOG_WARNING, LOG_NOTICE, LOG_INFO, LOG_DEBUG, - /* custom notices */ - LOG_BLUE = 0x10, -}; + /* custom notices */ + LOG_BLUE = 0x10, + LOG_MAJR = 0x11, + LOG_MINR = 0x12, + LOG_GREEN = 0x13, + LOG_PINK = 0x14 }; #endif extern bool is_power_of_2( int n ); @@ -216,7 +224,7 @@ json_t* json_load_url(char* cfg_url, json_error_t *err); void sha256_init(uint32_t *state); void sha256_transform(uint32_t *state, const uint32_t *block, int swap); -void sha256d(unsigned char *hash, const unsigned char *data, int len); +//void sha256d(unsigned char *hash, const unsigned char *data, int len); #ifdef USE_ASM #if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) @@ -225,7 +233,8 @@ int sha256_use_4way(); void sha256_init_4way(uint32_t *state); void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); #endif -#if defined(__x86_64__) && defined(USE_AVX2) +//#if defined(__x86_64__) && defined(USE_AVX2) +#if defined(__x86_64__) && defined(__AVX2__) #define HAVE_SHA256_8WAY 1 int 
sha256_use_8way(); void sha256_init_8way(uint32_t *state); @@ -271,9 +280,9 @@ struct thr_api { #define CL_N "\x1B[0m" #define CL_RED "\x1B[31m" #define CL_GRN "\x1B[32m" -#define CL_YLW "\x1B[33m" +#define CL_YLW "\x1B[33m" // dark yellow #define CL_BLU "\x1B[34m" -#define CL_MAG "\x1B[35m" +#define CL_MAG "\x1B[35m" // purple #define CL_CYN "\x1B[36m" #define CL_BLK "\x1B[22;30m" /* black */ @@ -281,7 +290,7 @@ struct thr_api { #define CL_GR2 "\x1B[22;32m" /* green */ #define CL_BRW "\x1B[22;33m" /* brown */ #define CL_BL2 "\x1B[22;34m" /* blue */ -#define CL_MA2 "\x1B[22;35m" /* magenta */ +#define CL_MA2 "\x1B[22;35m" /* purple */ #define CL_CY2 "\x1B[22;36m" /* cyan */ #define CL_SIL "\x1B[22;37m" /* gray */ @@ -290,9 +299,9 @@ struct thr_api { #else #define CL_GRY "\x1B[90m" /* dark gray selectable in putty */ #endif -#define CL_LRD "\x1B[01;31m" /* light red */ -#define CL_LGR "\x1B[01;32m" /* light green */ -#define CL_YL2 "\x1B[01;33m" /* yellow */ +#define CL_LRD "\x1B[01;31m" /* bright red */ +#define CL_LGR "\x1B[01;32m" /* bright green */ +#define CL_YL2 "\x1B[01;33m" /* bright yellow */ #define CL_LBL "\x1B[01;34m" /* light blue */ #define CL_LMA "\x1B[01;35m" /* light magenta */ #define CL_LCY "\x1B[01;36m" /* light cyan */ @@ -481,7 +490,7 @@ void format_hashrate(double hashrate, char *output); void print_hash_tests(void); void scale_hash_for_display ( double* hashrate, char* units ); - +void format_number_si( double* hashrate, char* si_units ); void report_summary_log( bool force ); /* diff --git a/simd-utils.h b/simd-utils.h index 55cc552..f2e201d 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -78,6 +78,8 @@ // - specialized shift and rotate functions that move elements around // use the notation "1x32" to indicate the distance moved as units of // the element size. +// Vector shuffle rotations are being renamed to "vrol" and "vror" +// to avoid confusion with bit rotations. // - there is a subset of some functions for scalar data. They may have // no prefix nor vec-size, just one size, the size of the data. // - Some integer functions are also defined which use a similar notation. diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index cedcae3..956f3e3 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -65,7 +65,7 @@ static inline void dintrlv_2x32( void *dst0, void *dst1, d0[24] = s[48]; d1[24] = s[49]; d0[25] = s[50]; d1[25] = s[51]; d0[26] = s[52]; d1[26] = s[53]; d0[27] = s[54]; d1[27] = s[55]; d0[28] = s[56]; d1[28] = s[57]; d0[29] = s[58]; d1[29] = s[59]; - d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[61]; d1[31] = s[63]; + d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[62]; d1[31] = s[63]; } static inline void extr_lane_2x32( void *dst, const void *src, diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 90066f0..765d847 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -35,6 +35,13 @@ /////////////////////////////////////////////////////////////////////////// +// Used instead if casting. +typedef union +{ + __m128i m128; + uint32_t u32[4]; +} __attribute__ ((aligned (16))) m128_ovly; + // Efficient and convenient moving between GP & low bits of XMM. // Use VEX when available to give access to xmm8-15 and zero extend for // larger vectors. 
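A short sketch of the two access patterns described above, assuming the m128_ovly overlay from this patch; the function names are illustrative. The low element moves through a GP register directly, while the overlay reads any lane without pointer casting:

   #include <immintrin.h>
   #include <stdint.h>

   static inline uint32_t low32( const __m128i v )
   {
      return (uint32_t)_mm_cvtsi128_si32( v );   // GP <- low 32 bits of XMM
   }

   static inline uint32_t lane32( const __m128i v, const int n )
   {
      m128_ovly o;
      o.m128 = v;          // store the vector once into the overlay
      return o.u32[ n ];   // read any 32 bit lane, no pointer cast needed
   }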
@@ -61,7 +68,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n ) return a; } -static inline uint64_t mm128_mov128_64( const __m128i a ) +// Inconsistent naming, prefix should reflect return value: +// u64_mov128_64 + +static inline uint64_t u64_mov128_64( const __m128i a ) { uint64_t n; #if defined(__AVX__) @@ -72,7 +82,7 @@ static inline uint64_t mm128_mov128_64( const __m128i a ) return n; } -static inline uint32_t mm128_mov128_32( const __m128i a ) +static inline uint32_t u32_mov128_32( const __m128i a ) { uint32_t n; #if defined(__AVX__) @@ -166,12 +176,17 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i, // Extract 32 bit element c from v and return as integer. static inline uint32_t mm128_extract_32( const __m128i v, const int c ) -{ return mm128_mov128_32( mm128_xim_32( v, v, c<<6 ) ); } +{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); } // Clear (zero) 32 bit elements based on bits set in 4 bit mask. static inline __m128i mm128_mask_32( const __m128i v, const int m ) { return mm128_xim_32( v, v, m ); } +// Move element i2 of v2 to element i1 of v1. For reference and convenience, +// it's faster to precalculate the index. +#define mm128_shuflmov_32( v1, i1, v2, i2 ) \ + mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) ) + #endif // SSE4_1 // @@ -257,12 +272,37 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #endif + + +// Diagonal blend: d = { v3[3], v2[2], v1[1], v0[0] } + +// Blend 4 32 bit elements from 4 vectors + +#if defined (__AVX2__) + +#define mm128_diagonal_32( v3, v2, v1, v0 ) \ + _mm_blend_epi32( _mm_blend_epi32( v3, v2, 0x4 ), \ + _mm_blend_epi32( v1, v0, 0x1 ), 0x3 ) + +#elif defined(__SSE4_1__) + +#define mm128_diagonal_32( v3, v2, v1, v0 ) \ + _mm_blend_epi16( _mm_blend_epi16( v3, v2, 0x30 ), \ + _mm_blend_epi16( v1, v0, 0x03 ), 0x0f ) + +#endif + + // // Bit rotations // AVX512VL has implemented bit rotation for 128 bit vectors with // 64 and 32 bit elements. +// x2 rotates elements in 2 individual vectors in a double buffered +// optimization for SSE2, does nothing for AVX512 but is there for +// transparency. // compiler doesn't like when a variable is used for the last arg of // _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same // specification but works with a variable.
Therefore use rol_var where @@ -290,6 +330,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_ror_32 _mm_ror_epi32 #define mm128_rol_32 _mm_rol_epi32 +#define mm128_rorx2_64( v1, v0, c ) \ + _mm_ror_epi64( v0, c ); \ + _mm_ror_epi64( v1, c ) + +#define mm128_rolx2_64( v1, v0, c ) \ + _mm_rol_epi64( v0, c ); \ + _mm_rol_epi64( v1, c ) + +#define mm128_rorx2_32( v1, v0, c ) \ + _mm_ror_epi32( v0, c ); \ + _mm_ror_epi32( v1, c ) + +#define mm128_rolx2_32( v1, v0, c ) \ + _mm_rol_epi32( v0, c ); \ + _mm_rol_epi32( v1, c ) + #else // SSE2 #define mm128_ror_64 mm128_ror_var_64 @@ -297,6 +353,46 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_ror_32 mm128_ror_var_32 #define mm128_rol_32 mm128_rol_var_32 +#define mm128_rorx2_64( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_srli_epi64( v0, c ); \ + __m128i t1 = _mm_srli_epi64( v1, c ); \ + v0 = _mm_slli_epi64( v0, 64-(c) ); \ + v1 = _mm_slli_epi64( v1, 64-(c) ); \ + v0 = _mm_or_si128( v0, t0 ); \ + v1 = _mm_or_si128( v1, t1 ); \ +} + +#define mm128_rolx2_64( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_slli_epi64( v0, c ); \ + __m128i t1 = _mm_slli_epi64( v1, c ); \ + v0 = _mm_srli_epi64( v0, 64-(c) ); \ + v1 = _mm_srli_epi64( v1, 64-(c) ); \ + v0 = _mm_or_si128( v0, t0 ); \ + v1 = _mm_or_si128( v1, t1 ); \ +} + +#define mm128_rorx2_32( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_srli_epi32( v0, c ); \ + __m128i t1 = _mm_srli_epi32( v1, c ); \ + v0 = _mm_slli_epi32( v0, 32-(c) ); \ + v1 = _mm_slli_epi32( v1, 32-(c) ); \ + v0 = _mm_or_si128( v0, t0 ); \ + v1 = _mm_or_si128( v1, t1 ); \ +} + +#define mm128_rolx2_32( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_slli_epi32( v0, c ); \ + __m128i t1 = _mm_slli_epi32( v1, c ); \ + v0 = _mm_srli_epi32( v0, 32-(c) ); \ + v1 = _mm_srli_epi32( v1, 32-(c) ); \ + v0 = _mm_or_si128( v0, t0 ); \ + v1 = _mm_or_si128( v1, t1 ); \ +} + #endif // AVX512 else SSE2 #define mm128_ror_16( v, c ) \ @@ -309,16 +405,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) // Rotate vector elements across all lanes #define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) -#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) -#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) +#define mm128_shuflr_64 mm128_swap_64 +#define mm128_shufll_64 mm128_swap_64 + +#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 ) +#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 ) + // Swap 32 bit elements in 64 bit lanes #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) +#define mm128_shuflr64_32 mm128_swap64_32 +#define mm128_shufll64_32 mm128_swap64_32 #if defined(__SSSE3__) // Rotate right by c bytes, no SSE2 equivalent. -static inline __m128i mm128_ror_x8( const __m128i v, const int c ) +static inline __m128i mm128_shuflr_x8( const __m128i v, const int c ) { return _mm_alignr_epi8( v, v, c ); } // @@ -422,59 +524,88 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) v1 = _mm_xor_si128( v1, v2 ); +// Two input shuffle-rotate. // Concatenate v1 & v2 and rotate as one 256 bit vector. -#if defined(__SSE4_1__) +// Continue to use vror/vrol for now to avoid confusion with +// shufl2r/shufl2l function macros available with AVX512. -#define mm128_ror256_64( v1, v2 ) \ +#if defined(__SSSE3__) + +// Function macro with two inputs and one output, inputs are preserved. +// Returns modified first arg. +// Two input functions are not available without SSSE3. Use procedure +// below instead.
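For reference, _mm_alignr_epi8( hi, lo, n ) treats { hi, lo } as one 256 bit value, shifts it right n bytes and returns the low 128 bits. A sketch of a two input 64 bit shuffle-rotate built on it, assuming SSSE3; the function name is illustrative:

   #include <tmmintrin.h>

   static inline __m128i shufl2r_64_sketch( const __m128i v1, const __m128i v2 )
   {
      // { v2, v1 } >> 64 bits: result = { v2[0], v1[1] }, v1 & v2 unchanged
      return _mm_alignr_epi8( v2, v1, 8 );
   }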
+ +#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) +#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) + +#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 ) +#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 ) + +#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 ) +#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 ) + +#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 1 ) +#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 1 ) + +// Procedure macros with 2 inputs and 2 outputs, inputs are destroyed. +// Returns both modified args in place. + +// These macros retain the vrol/vror name for now to avoid +// confusion with the shufl2r/shufl2l function macros above. +// These may be renamed to something like shufl2r2 for 2 inputs and +// 2 outputs, i.e. SHUFfLe 2 inputs Right with 2 outputs. + +#define mm128_vror256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v1 = _mm_alignr_epi8( v2, v1, 8 ); \ v2 = t; \ } while(0) -#define mm128_rol256_64( v1, v2 ) \ +#define mm128_vrol256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v2 = _mm_alignr_epi8( v2, v1, 8 ); \ v1 = t; \ } while(0) -#define mm128_ror256_32( v1, v2 ) \ +#define mm128_vror256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \ v1 = _mm_alignr_epi8( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm128_rol256_32( v1, v2 ) \ +#define mm128_vrol256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \ v2 = _mm_alignr_epi8( v2, v1, 12 ); \ v1 = t; \ } while(0) -#define mm128_ror256_16( v1, v2 ) \ +#define mm128_vror256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \ v1 = _mm_alignr_epi8( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm128_rol256_16( v1, v2 ) \ +#define mm128_vrol256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \ v2 = _mm_alignr_epi8( v2, v1, 14 ); \ v1 = t; \ } while(0) -#define mm128_ror256_8( v1, v2 ) \ +#define mm128_vror256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \ v1 = _mm_alignr_epi8( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm128_rol256_8( v1, v2 ) \ +#define mm128_vrol256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \ v2 = _mm_alignr_epi8( v2, v1, 15 ); \ @@ -483,7 +614,7 @@ do { \ #else // SSE2 -#define mm128_ror256_64( v1, v2 ) \ +#define mm128_vror256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \ _mm_slli_si128( v2, 8 ) ); \ @@ -492,7 +623,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_64( v1, v2 ) \ +#define mm128_vrol256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \ _mm_srli_si128( v2, 8 ) ); \ @@ -501,7 +632,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_32( v1, v2 ) \ +#define mm128_vror256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \ _mm_slli_si128( v2, 12 ) ); \ @@ -510,7 +641,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_32( v1, v2 ) \ +#define mm128_vrol256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \ _mm_srli_si128( v2, 12 ) ); \ @@ -519,7 +650,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_16( v1, v2 ) \ +#define mm128_vror256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \ _mm_slli_si128( v2, 14 ) ); \ @@ -528,7 +659,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_16( v1, v2 ) \ +#define mm128_vrol256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) ); \ @@ -537,7 +668,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_8( v1, v2 ) \ +#define mm128_vror256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \ _mm_slli_si128( v2, 15 ) ); \ @@ -546,7 +677,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_8( v1, v2 ) \ +#define mm128_vrol256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \ _mm_srli_si128( v2, 15 ) ); \ diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 125e2c8..3d84010 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -14,13 +14,28 @@ // is limited because 256 bit vectors are less likely to be used when 512 // is available. +// Used instead of casting. +typedef union +{ + __m256i m256; + __m128i m128[2]; + uint64_t u64[4]; + uint32_t u32[8]; +} __attribute__ ((aligned (32))) m256_ovly; + + // Move integer to low element of vector, other elements are set to zero. #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) #define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) ) // Move low element of vector to integer. -#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) ) -#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) ) +#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) ) +#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) ) + +// deprecated +//#define mm256_mov256_64 u64_mov256_64 +//#define mm256_mov256_32 u32_mov256_32 + // concatenate two 128 bit vectors into one 256 bit vector: { hi, lo } #define mm256_concat_128( hi, lo ) \ @@ -214,12 +229,41 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #endif +// Diagonal blending + +// Blend 4 64 bit elements from 4 vectors +#define mm256_diagonal_64( v3, v2, v1, v0 ) \ + _mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \ + _mm256_blend_epi32( v1, v0, 0x03 ), 0x0f ) + +// Blend 8 32 bit elements from 8 vectors +#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v7, v6, 0x40 ), \ + _mm256_blend_epi32( v5, v4, 0x10 ), 0x30 ), \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v3, v2, 0x04 ), \ + _mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f ) + + +// Blend 4 32 bit elements from each 128 bit lane. +#define mm256_diagonal128_32( v3, v2, v1, v0 ) \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v3, v2, 0x44 ), \ + _mm256_blend_epi32( v1, v0, 0x11 ), 0x33 ) + + // // Bit rotations. // -// The only bit shift for more than 64 bits is with __int128. +// The only bit shift for more than 64 bits is with __int128 which is slow. // // AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements +// +// x2 rotates elements in 2 individual vectors in a double buffered +// optimization for SSE2, does nothing for AVX512 but is there for +// transparency.
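A sketch of the double buffered idea mentioned above: rotating two independent vectors back to back lets the shift/or sequences interleave and hide instruction latency on CPUs without AVX512 rotate instructions. AVX2 assumed; the name and rotate count are illustrative:

   #include <immintrin.h>

   static inline void ror64x2_by25( __m256i *v1, __m256i *v0 )
   {
      __m256i t0 = _mm256_srli_epi64( *v0, 25 );
      __m256i t1 = _mm256_srli_epi64( *v1, 25 );               // independent of t0
      *v0 = _mm256_or_si256( _mm256_slli_epi64( *v0, 39 ), t0 );
      *v1 = _mm256_or_si256( _mm256_slli_epi64( *v1, 39 ), t1 );
   }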
// compiler doesn't like when a variable is used for the last arg of @@ -255,6 +299,22 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_ror_32 _mm256_ror_epi32 #define mm256_rol_32 _mm256_rol_epi32 +#define mm256_rorx2_64( v1, v0, c ) \ + _mm256_ror_epi64( v0, c ); \ + _mm256_ror_epi64( v1, c ) + +#define mm256_rolx2_64( v1, v0, c ) \ + _mm256_rol_epi64( v0, c ); \ + _mm256_rol_epi64( v1, c ) + +#define mm256_rorx2_32( v1, v0, c ) \ + _mm256_ror_epi32( v0, c ); \ + _mm256_ror_epi32( v1, c ) + +#define mm256_rolx2_32( v1, v0, c ) \ + _mm256_rol_epi32( v0, c ); \ + _mm256_rol_epi32( v1, c ) + #else // AVX2 #define mm256_ror_64 mm256_ror_var_64 @@ -262,6 +322,46 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_ror_32 mm256_ror_var_32 #define mm256_rol_32 mm256_rol_var_32 +#define mm256_rorx2_64( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_srli_epi64( v0, c ); \ + __m256i t1 = _mm256_srli_epi64( v1, c ); \ + v0 = _mm256_slli_epi64( v0, 64-(c) ); \ + v1 = _mm256_slli_epi64( v1, 64-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rolx2_64( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi64( v0, c ); \ + __m256i t1 = _mm256_slli_epi64( v1, c ); \ + v0 = _mm256_srli_epi64( v0, 64-(c) ); \ + v1 = _mm256_srli_epi64( v1, 64-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rorx2_32( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_srli_epi32( v0, c ); \ + __m256i t1 = _mm256_srli_epi32( v1, c ); \ + v0 = _mm256_slli_epi32( v0, 32-(c) ); \ + v1 = _mm256_slli_epi32( v1, 32-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rolx2_32( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi32( v0, c ); \ + __m256i t1 = _mm256_slli_epi32( v1, c ); \ + v0 = _mm256_srli_epi32( v0, 32-(c) ); \ + v1 = _mm256_srli_epi32( v1, 32-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + #endif // AVX512 else AVX2 #define mm256_ror_16( v, c ) \ @@ -276,58 +376,45 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Rotate elements accross all lanes. -#if defined(__AVX512VL__) - -static inline __m256i mm256_swap_128( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 2 ); } - -static inline __m256i mm256_ror_1x64( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 1 ); } - -static inline __m256i mm256_rol_1x64( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 3 ); } - -static inline __m256i mm256_ror_1x32( const __m256i v ) -{ return _mm256_alignr_epi32( v, v, 1 ); } - -static inline __m256i mm256_rol_1x32( const __m256i v ) -{ return _mm256_alignr_epi32( v, v, 7 ); } - -#else // AVX2 - // Swap 128 bit elements in 256 bit vector. #define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) +#define mm256_shuflr_128 mm256_swap_128 +#define mm256_shufll_128 mm256_swap_128 // Rotate 256 bit vector by one 64 bit element -#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) +#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 ) + +#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 ) // Rotate 256 bit vector by one 32 bit element. 
-#define mm256_ror_1x32( v ) \ +#define mm256_shuflr_32( v ) \ _mm256_permutevar8x32_epi32( v, \ m256_const_64( 0x0000000000000007, 0x0000000600000005, \ - 0x0000000400000003, 0x0000000200000001 ) + 0x0000000400000003, 0x0000000200000001 ) ) -#define mm256_rol_1x32( v ) \ +#define mm256_shufll_32( v ) \ _mm256_permutevar8x32_epi32( v, \ m256_const_64( 0x0000000600000005, 0x0000000400000003, \ - 0x0000000200000001, 0x0000000000000007 ) + 0x0000000200000001, 0x0000000000000007 ) ) -#endif // AVX512 else AVX2 - // // Rotate elements within each 128 bit lane of 256 bit vector. #define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) -#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) -#define mm256_rol128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) +#define mm256_shuflr128_64 mm256_swap128_64 +#define mm256_shufll128_64 mm256_swap128_64 -static inline __m256i mm256_ror128_x8( const __m256i v, const int c ) +#define mm256_shuflr128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) +#define mm256_shufll128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) + +static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) { return _mm256_alignr_epi8( v, v, c ); } // Swap 32 bit elements in each 64 bit lane. #define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) +#define mm256_shuflr64_32 mm256_swap64_32 +#define mm256_shufll64_32 mm256_swap64_32 // // Swap bytes in vector elements, endian bswap. @@ -387,19 +474,21 @@ static inline __m256i mm256_ror128_x8( const __m256i v, const int c ) // _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also // makes these macros unnecessary. +// continue using vror/vrol notation for now to avoid confusion with +// shufl2r/shufl2l macro functions available with AVX512. #define mm256_swap512_256( v1, v2 ) \ v1 = _mm256_xor_si256( v1, v2 ); \ v2 = _mm256_xor_si256( v1, v2 ); \ v1 = _mm256_xor_si256( v1, v2 ); -#define mm256_ror512_128( v1, v2 ) \ +#define mm256_vror512_128( v1, v2 ) \ do { \ __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ v1 = _mm256_permute2x128( v2, v1, 0x21 ); \ v2 = t; \ } while(0) -#define mm256_rol512_128( v1, v2 ) \ +#define mm256_vrol512_128( v1, v2 ) \ do { \ __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ v2 = _mm256_permute2x128( v2, v1, 0x21 ); \ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index e6b7ac2..de948cc 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -74,13 +74,22 @@ // __AVX512VBMI__ __AVX512VAES__ // +// Used instead if casting. +typedef union +{ + __m512i m512; + __m128i m128[4]; + uint32_t u32[16]; + uint64_t u64[8]; +} __attribute__ ((aligned (64))) m512_ovly; + // Move integer to/from element 0 of vector. #define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) ) #define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) ) -#define mm512_mov256_64( a ) mm128_mov128_64( _mm256_castsi512_si128( a ) ) -#define mm512_mov256_32( a ) mm128_mov128_32( _mm256_castsi512_si128( a ) ) +#define u64_mov512_64( a ) u64_mov128_64( _mm256_castsi512_si128( a ) ) +#define u32_mov512_32( a ) u32_mov128_32( _mm256_castsi512_si128( a ) ) // A simple 128 bit permute, using function instead of macro avoids // problems if the v arg passed as an expression. 
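A short illustration of the point above: a macro expands its v argument once per operand, so an expression argument would be evaluated twice, whereas the inline function evaluates it once. The _mm512_shuffle_i64x2 call is an assumption about the underlying permute, used here only to show the pattern:

   #include <immintrin.h>

   // Unsafe if v has side effects: v appears twice in the expansion.
   #define PERM128_MACRO( v, c )   _mm512_shuffle_i64x2( v, v, c )

   // Safe: the argument expression is evaluated once at the call site.
   static inline __m512i perm128_func( const __m512i v, const int c )
   {
      return _mm512_shuffle_i64x2( v, v, c );
   }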
@@ -91,6 +100,10 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c ) #define mm512_concat_256( hi, lo ) \ _mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 ) +#define m512_const_128( v3, v2, v1, v0 ) \ + mm512_concat_256( mm256_concat_128( v3, v2 ), \ + mm256_concat_128( v1, v0 ) ) + // Equivalent of set, assign 64 bit integers to respective 64 bit elements. // Use stack memory overlay static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, @@ -225,7 +238,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Ternary logic uses 8 bit truth table to define any 3 input logical -// operation using any number or combinations of AND, OR XOR, NOT. +// expression using any number or combinations of AND, OR, XOR, NOT. // a ^ b ^ c #define mm512_xor3( a, b, c ) \ @@ -251,11 +264,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) #define mm512_andxor( a, b, c ) \ _mm512_ternarylogic_epi64( a, b, c, 0x60 ) -// a ^ ( b & c ) +// a ^ ( b | c ) #define mm512_xoror( a, b, c ) \ _mm512_ternarylogic_epi64( a, b, c, 0x1e ) -// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ] +// a ^ ( ~b & c ) xor( a, andnot( b, c ) ) #define mm512_xorandnot( a, b, c ) \ _mm512_ternarylogic_epi64( a, b, c, 0xd2 ) @@ -265,11 +278,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // Some 2 input operations that don't have their own instruction mnemonic. -// ~( a | b ) +// ~( a | b ), (~a) & (~b) #define mm512_nor( a, b ) \ _mm512_ternarylogic_epi64( a, b, b, 0x01 ) -// ~( a ^ b ), same as (~a) ^ b +// ~( a ^ b ), (~a) ^ b #define mm512_xnor( a, b ) \ _mm512_ternarylogic_epi64( a, b, b, 0x81 ) @@ -278,6 +291,27 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) _mm512_ternarylogic_epi64( a, b, b, 0xef ) +// Diagonal blending +// Blend 8 64 bit elements from 8 vectors +#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \ + _mm512_mask_blend_epi64( 0x0f, \ + _mm512_mask_blend_epi64( 0x30, \ + _mm512_mask_blend_epi64( 0x40, v7, v6 ), \ + _mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \ + _mm512_mask_blend_epi64( 0x03, \ + _mm512_mask_blend_epi64( 0x04, v3, v2 ) \ + _mm512_mask_blend_epi64( 0x01, v1, v0 ) ) ) + + +// Blend 4 32 bit elements from each 128 bit lane. +#define mm512_diagonal128_32( v3, v2, v1, v0 ) \ + _mm512_mask_blend_epi32( 0x3333, \ + _mm512_mask_blend_epi32( 0x4444, v3, v2 ), \ + _mm512_mask_blend_epi32( 0x1111, v1, v0 ) ) + + + + // Bit rotations. // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit @@ -395,59 +429,95 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c ) casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \ } while(0) - // -// Rotate elements in 512 bit vector. +// Shift with zero fill & shuffle-rotate elements in 512 bit vector. +// + +// rename plan change ror to vror for Vector ROtate Right, +// and vrol for Vector ROtate Left, not to be confused with +//variable rotate rorv, rolv, +// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate +// operation. 1xNN notaion ia also removed and replaced with simpler NN. +// Swap will still have its own mnemonic and will be aliased as both +// left and right shuffles. + +// Shift elements right or left in 512 bit vector, filling with zeros. +// Multiple element shifts can be combined into a single larger +// element shift. 
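A minimal illustration of the zero fill idea the following macros implement, assuming AVX512F: aligning against a zero vector shifts whole elements while filling the vacated lanes with zero (the function name is illustrative).

   #include <immintrin.h>

   static inline __m512i shiftr_64_sketch( const __m512i v )
   {
      // { zero, v } shifted right one 64 bit element:
      // result lanes 0..6 = v lanes 1..7, lane 7 = 0.
      return _mm512_alignr_epi64( _mm512_setzero_si512(), v, 1 );
   }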
+ +#define mm512_shiftr_256( v ) \ + _mm512_alignr_epi64( _mm512_setzero_si512(), v, 4 ) +#define mm512_shiftl_256( v ) \ + _mm512_alignr_epi64( v, _mm512_setzero_si512(), 4 ) + +#define mm512_shiftr_128( v ) \ + _mm512_alignr_epi64( _mm512_setzero_si512(), v, 2 ) +#define mm512_shiftl_128( v ) \ + _mm512_alignr_epi64( v, _mm512_setzero_si512(), 6 ) + +#define mm512_shiftr_64( v ) \ + _mm512_alignr_epi64( _mm512_setzero_si512(), v, 1 ) +#define mm512_shiftl_64( v ) \ + _mm512_alignr_epi64( v, _mm512_setzero_si512(), 7 ) + +#define mm512_shiftr_32( v ) \ + _mm512_alignr_epi32( _mm512_setzero_si512(), v, 1 ) +#define mm512_shiftl_32( v ) \ + _mm512_alignr_epi32( v, _mm512_setzero_si512(), 15 ) + +// Shuffle-rotate elements left or right in 512 bit vector. static inline __m512i mm512_swap_256( const __m512i v ) { return _mm512_alignr_epi64( v, v, 4 ); } +#define mm512_shuflr_256( v ) mm512_swap_256( v ) +#define mm512_shufll_256( v ) mm512_swap_256( v ) -static inline __m512i mm512_ror_1x128( const __m512i v ) +static inline __m512i mm512_shuflr_128( const __m512i v ) { return _mm512_alignr_epi64( v, v, 2 ); } -static inline __m512i mm512_rol_1x128( const __m512i v ) +static inline __m512i mm512_shufll_128( const __m512i v ) { return _mm512_alignr_epi64( v, v, 6 ); } -static inline __m512i mm512_ror_1x64( const __m512i v ) +static inline __m512i mm512_shuflr_64( const __m512i v ) { return _mm512_alignr_epi64( v, v, 1 ); } -static inline __m512i mm512_rol_1x64( const __m512i v ) +static inline __m512i mm512_shufll_64( const __m512i v ) { return _mm512_alignr_epi64( v, v, 7 ); } -static inline __m512i mm512_ror_1x32( const __m512i v ) +static inline __m512i mm512_shuflr_32( const __m512i v ) { return _mm512_alignr_epi32( v, v, 1 ); } -static inline __m512i mm512_rol_1x32( const __m512i v ) +static inline __m512i mm512_shufll_32( const __m512i v ) { return _mm512_alignr_epi32( v, v, 15 ); } -static inline __m512i mm512_ror_x64( const __m512i v, const int n ) +// Generic +static inline __m512i mm512_shuflr_x64( const __m512i v, const int n ) { return _mm512_alignr_epi64( v, v, n ); } -static inline __m512i mm512_ror_x32( const __m512i v, const int n ) +static inline __m512i mm512_shufll_x32( const __m512i v, const int n ) { return _mm512_alignr_epi32( v, v, n ); } -#define mm512_ror_1x16( v ) \ +#define mm512_shuflr_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x0000001F001E001D, 0x001C001B001A0019, \ 0X0018001700160015, 0X0014001300120011, \ 0X0010000F000E000D, 0X000C000B000A0009, \ 0X0008000700060005, 0X0004000300020001 ), v ) -#define mm512_rol_1x16( v ) \ +#define mm512_shufll_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x001E001D001C001B, 0x001A001900180017, \ 0X0016001500140013, 0X001200110010000F, \ 0X000E000D000C000B, 0X000A000900080007, \ 0X0006000500040003, 0X000200010000001F ), v ) -#define mm512_ror_1x8( v ) \ +#define mm512_shuflr_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x003F3E3D3C3B3A39, 0x3837363534333231, \ 0x302F2E2D2C2B2A29, 0x2827262524232221, \ 0x201F1E1D1C1B1A19, 0x1817161514131211, \ 0x100F0E0D0C0B0A09, 0x0807060504030201 ) ) -#define mm512_rol_1x8( v ) \ +#define mm512_shufll_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3E3D3C3B3A393837, 0x363534333231302F, \ 0x2E2D2C2B2A292827, 0x262524232221201F, \ @@ -456,51 +526,55 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n ) // // Rotate elements within 256 bit lanes of 512 bit vector. +// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane #define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e ) +#define mm512_shuflr256_128 mm512_swap256_128 +#define mm512_shufll256_128 mm512_swap256_128 // Rotate 256 bit lanes by one 64 bit element -#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 ) -#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 ) +#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 ) + +#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element -#define mm512_ror256_32( v ) \ +#define mm512_shuflr256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x000000080000000f, 0x0000000e0000000d, \ 0x0000000c0000000b, 0x0000000a00000009, \ 0x0000000000000007, 0x0000000600000005, \ 0x0000000400000003, 0x0000000200000001 ), v ) -#define mm512_rol256_32( v ) \ +#define mm512_shufll256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x0000000e0000000d, 0x0000000c0000000b, \ 0x0000000a00000009, 0x000000080000000f, \ 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ), v ) -#define mm512_ror256_16( v ) \ +#define mm512_shuflr256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x00100001001e001d, 0x001c001b001a0019, \ 0x0018001700160015, 0x0014001300120011, \ 0x0000000f000e000d, 0x000c000b000a0009, \ 0x0008000700060005, 0x0004000300020001 ), v ) -#define mm512_rol256_16( v ) \ +#define mm512_shufll256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x001e001d001c001b, 0x001a001900180017, \ 0x0016001500140013, 0x001200110010001f, \ 0x000e000d000c000b, 0x000a000900080007, \ 0x0006000500040003, 0x000200010000000f ), v ) -#define mm512_ror256_8( v ) \ +#define mm512_shuflr256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x203f3e3d3c3b3a39, 0x3837363534333231, \ 0x302f2e2d2c2b2a29, 0x2827262524232221, \ 0x001f1e1d1c1b1a19, 0x1817161514131211, \ 0x100f0e0d0c0b0a09, 0x0807060504030201 ) ) -#define mm512_rol256_8( v ) \ +#define mm512_shufll256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3e3d3c3b3a393837, 0x363534333231302f, \ 0x2e2d2c2b2a292827, 0x262524232221203f, \ @@ -508,82 +582,120 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n ) 0x0e0d0c0b0a090807, 0x060504030201001f ) ) // -// Rotate elements within 128 bit lanes of 512 bit vector. - +// Shuffle-roate elements within 128 bit lanes of 512 bit vector. + // Swap 64 bits in each 128 bit lane #define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) +#define mm512_shuflr128_64 mm512_swap128_64 +#define mm512_shufll128_64 mm512_swap128_64 // Rotate 128 bit lanes by one 32 bit element -#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) -#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) +#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) +#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) -// Rotate right 128 bit lanes by c bytes -static inline __m512i mm512_ror128_x8( const __m512i v, const int c ) +// Rotate right 128 bit lanes by c bytes, versatile and just as fast +static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) { return _mm512_alignr_epi8( v, v, c ); } -// Swap 32 bits in each 64 bit lane. +// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction +// but only with AVX512. Shuffle is just as fast and availble with AVX2 +// & SSE2. 
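To illustrate the comment above: swapping the 32 bit halves of every 64 bit lane gives the same result as rotating each 64 bit lane by 32 bits, so the SSE2/AVX2-era shuffle matches the AVX512 rotate. A sketch with an illustrative name:

   #include <immintrin.h>

   static inline __m512i swap64_32_sketch( const __m512i v )
   {
      // 0xb1 exchanges the two 32 bit halves of every 64 bit lane,
      // equivalent to a 64 bit rotate by 32.
      return _mm512_shuffle_epi32( v, 0xb1 );
   }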
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) +#define mm512_shuflr64_32 mm512_swap64_32 +#define mm512_shufll64_32 mm512_swap64_32 - +// Need good way to distinguish 1 input shuffles, 2 input shuffle functions, +// and 2 input 2 output shuffle macros. // -// Rotate elements from 2 512 bit vectors in place, source arguments +// shuflr is 1 input +// shufl2r is 2 input ... +// Drop macros? They can easily be rebuilt using shufl2 functions + +// add shuflr shufll functions performing rotate, returning first arg +// They're faster than doing both, when both not needed. + +// Shuffle concatenated { v1, v2 } right or left by 256 bits and return +// rotated v1 +// visually confusing for shufl2r because of arg order. First arg is always +// the target for modification, either update by reference or by function +// return. +#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 ) +#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 ) + +#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 ) +#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 ) + +#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 ) +#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 ) + +#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 ) +#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 ) + +// Rotate elements from 2 512 bit vectors in place, source arguments // are overwritten. #define mm512_swap1024_512( v1, v2 ) \ v1 = _mm512_xor_si512( v1, v2 ); \ v2 = _mm512_xor_si512( v1, v2 ); \ v1 = _mm512_xor_si512( v1, v2 ); +#define mm512_shufl2l_512 mm512_swap1024_512 +#define mm512_shufl2r_512 mm512_swap1024_512 -#define mm512_ror1024_256( v1, v2 ) \ +// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is +// for now. +// Rotate elements from 2 512 bit vectors in place, both source arguments +// are updated.
+ +#define mm512_vror1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v1 = _mm512_alignr_epi64( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_256( v1, v2 ) \ +#define mm512_vrol1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v2 = _mm512_alignr_epi64( v2, v1, 4 ); \ v1 = t; \ } while(0) -#define mm512_ror1024_128( v1, v2 ) \ +#define mm512_vror1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \ v1 = _mm512_alignr_epi64( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_128( v1, v2 ) \ +#define mm512_vrol1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \ v2 = _mm512_alignr_epi64( v2, v1, 6 ); \ v1 = t; \ } while(0) -#define mm512_ror1024_64( v1, v2 ) \ +#define mm512_vror1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \ v1 = _mm512_alignr_epi64( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_64( v1, v2 ) \ +#define mm512_vrol1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \ v2 = _mm512_alignr_epi64( v2, v1, 7 ); \ v1 = t; \ } while(0) -#define mm512_ror1024_32( v1, v2 ) \ +#define mm512_vror1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \ v1 = _mm512_alignr_epi32( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_32( v1, v2 ) \ +#define mm512_vrol1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \ v2 = _mm512_alignr_epi32( v2, v1, 15 ); \ diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h index e74066b..31b0b89 100644 --- a/simd-utils/simd-64.h +++ b/simd-utils/simd-64.h @@ -68,13 +68,13 @@ // rotation. // Swap hi & lo 32 bits. -#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e ) +#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e ) -#define mm64_ror64_1x16( a ) _mm_shuffle_pi16( a, 0x39 ) -#define mm64_rol64_1x16( a ) _mm_shuffle_pi16( a, 0x93 ) +#define mm64_shulfr_16( a ) _mm_shuffle_pi16( a, 0x39 ) +#define mm64_shufll_16( a ) _mm_shuffle_pi16( a, 0x93 ) // Swap hi & lo 16 bits of each 32 bit element -#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 ) +#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 ) #if defined(__SSSE3__) @@ -86,7 +86,7 @@ _mm_shuffle_pi8( v, (__m64)0x0607040502030001 ); // Rotate right by c bytes -static inline __m64 mm64_ror_x8( __m64 v, const int c ) +static inline __m64 mm64_vror_x8( __m64 v, const int c ) { return _mm_alignr_pi8( v, v, c ); } #else diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 4a7188e..601c750 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -5,10 +5,19 @@ #define bswap_64( a ) __builtin_bswap64( a ) #define bswap_32( a ) __builtin_bswap32( a ) -// safe division, integer or floating point +// Safe division, integer or floating point. For floating point it's as +// safe as 0. is precisely zero. +// Returns safe_result if division by zero. #define safe_div( dividend, divisor, safe_result ) \ ( (divisor) == 0 ? 
safe_result : ( (dividend) / (divisor) ) ) +// Aliases with familiar names for built in bit rotate instructions +#define rol64( a, n ) _lrotl( a, n ) +#define ror64( a, n ) _lrotr( a, n ) +#define rol32( a, n ) _rotl( a, n ) +#define ror32( a, n ) _rotr( a, n ) +#define rol16( a, n ) _rotwl( a, n ) +#define ror16( a, n ) _rotwr( a, n ) /////////////////////////////////////// // @@ -29,12 +38,14 @@ // __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 ); // my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 ); +// obsolete test // Compiler check for __int128 support // Configure also has a test for int128. #if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) ) #define GCC_INT128 1 #endif +// obsolte test #if !defined(GCC_INT128) #warning "__int128 not supported, requires GCC-4.8 or newer." #endif diff --git a/sysinfos.c b/sysinfos.c index 010c78f..ed453e2 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -218,7 +218,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz) for (int i = 2; i <= (ext & 0xF); i++) { cpuid(0x80000000+i, output); - memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int)); + memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int)); } snprintf(outbuf, maxsz, "%s", brand); } diff --git a/util.c b/util.c index b96c4fe..31b9270 100644 --- a/util.c +++ b/util.c @@ -47,6 +47,7 @@ //#include "miner.h" #include "elist.h" #include "algo-gate-api.h" +#include "algo/sha/sha256d.h" //extern pthread_mutex_t stats_lock; @@ -129,17 +130,19 @@ void applog2( int prio, const char *fmt, ... ) // localtime_r(&now, &tm); - switch (prio) { + switch ( prio ) + { + case LOG_CRIT: color = CL_LRD; break; case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; + case LOG_WARNING: color = CL_YL2; break; + case LOG_MAJR: color = CL_YL2; break; case LOG_NOTICE: color = CL_WHT; break; case LOG_INFO: color = ""; break; case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; + case LOG_MINR: color = CL_YLW; break; + case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break; + case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break; + case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break; } if (!use_colors) color = ""; @@ -206,17 +209,19 @@ void applog(int prio, const char *fmt, ...) 
localtime_r(&now, &tm); - switch (prio) { - case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; + switch ( prio ) + { + case LOG_CRIT: color = CL_LRD; break; + case LOG_ERR: color = CL_RED; break; + case LOG_WARNING: color = CL_YL2; break; + case LOG_MAJR: color = CL_YL2; break; case LOG_NOTICE: color = CL_WHT; break; - case LOG_INFO: color = ""; break; + case LOG_INFO: color = ""; break; case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; + case LOG_MINR: color = CL_YLW; break; + case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break; + case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break; + case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break; } if (!use_colors) color = ""; @@ -303,6 +308,29 @@ void format_hashrate(double hashrate, char *output) ); } +// For use with MiB etc +void format_number_si( double* n, char* si_units ) +{ + if ( *n < 1024*10 ) { *si_units = 0; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'k'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'M'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'G'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'T'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'P'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'E'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'Z'; return; } + *n /= 1024; + *si_units = 'Y'; +} + + /* Modify the representation of integer numbers which would cause an overflow * so that they are treated as floating-point numbers. * This is a hack to overcome the limitations of some versions of Jansson. */
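A usage sketch for format_number_si above, e.g. for the memory usage line printed at startup; the function and values here are illustrative only, assuming the prototype declared in miner.h:

   #include <stdio.h>

   static void report_mem_usage( double bytes )
   {
      char unit = 0;
      format_number_si( &bytes, &unit );          // scales in steps of 1024
      if ( unit )
         printf( "Memory usage %.1f %ciB\n", bytes, unit );   // e.g. "6144.0 MiB"
      else
         printf( "Memory usage %.0f B\n", bytes );
   }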