diff --git a/Makefile.am b/Makefile.am index d5398c0..a4adc3b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -163,6 +163,8 @@ cpuminer_SOURCES = \ algo/sha/sph_sha2big.c \ algo/sha/sha256-hash-4way.c \ algo/sha/sha512-hash-4way.c \ + algo/sha/sha256-hash-opt.c \ + algo/sha/sha256-hash-2way-ni.c \ algo/sha/hmac-sha256-hash.c \ algo/sha/hmac-sha256-hash-4way.c \ algo/sha/sha2.c \ diff --git a/README.txt b/README.txt index 08c34b9..22428ec 100644 --- a/README.txt +++ b/README.txt @@ -64,6 +64,11 @@ source code obtained from the author's official repository. The exact procedure is documented in the build instructions for Windows: https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source +Some DLL files may already be installed on the system by Windows or third +party packages. They will often work and may be used instead of the included +files. Without a compelling reason to do so, it's recommended to use the included +files as they are packaged. + If you like this software feel free to donate: BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT diff --git a/RELEASE_NOTES b/RELEASE_NOTES index a500a8d..cdacd32 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,13 @@ If not what makes it happen or not happen? Change Log ---------- +v3.17.0 + +AVX512 optimized using ternary logic instructions. +Faster sha256t on all CPU architectures: AVX512 +30%, SHA +30%, AVX2 +9%. +Use SHA on supported CPUs to produce merkle hash. +Fixed byte order in Extranonce2 log & replaced Block height with Job ID. + v3.16.5 #329: Fixed GBT incorrect target diff in stats, second attempt. diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index ae97b94..4778914 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -1293,32 +1293,26 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); #define DH1L( m, sl, sr, a, b, c ) \ - _mm512_add_epi64( \ - _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \ - _mm512_srli_epi64( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \ + _mm512_srli_epi64( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH1R( m, sl, sr, a, b, c ) \ - _mm512_add_epi64( \ - _mm512_xor_si512( M[m], \ - _mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \ - _mm512_slli_epi64( qt[a], sr ) ) ), \ - _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) ) + _mm512_add_epi64( mm512_xor3( M[m], _mm512_srli_epi64( xh, sl ), \ + _mm512_slli_epi64( qt[a], sr ) ), \ + mm512_xor3( xl, qt[b], qt[c] ) ) #define DH2L( m, rl, sl, h, a, b, c ) \ _mm512_add_epi64( _mm512_add_epi64( \ - mm512_rol_64( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); - + mm512_rol_64( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_slli_epi64( xl, sl ), qt[b], qt[c] ) ) + #define DH2R( m, rl, sr, h, a, b, c ) \ _mm512_add_epi64( _mm512_add_epi64( \ - mm512_rol_64( dH[h], rl ), \ - _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \ - _mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \ - _mm512_xor_si512( qt[b], qt[c] ) ) ); + mm512_rol_64( dH[h], rl ), \ + mm512_xor3( xh, qt[a], M[m] ) ), \ + mm512_xor3( _mm512_srli_epi64( xl, sr ), qt[b], qt[c] ) ) dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index
2a952a7..9944ebe 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -560,22 +560,14 @@ do { \ __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \ dm = mm512_negate_32( _mm512_or_si512( dm, \ _mm512_slli_epi64( dm, 32 ) ) ); \ - m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \ - m512_const1_64( tp[0] ) ) ); \ - m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \ - m512_const1_64( tp[1] ) ) ); \ - m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \ - m512_const1_64( tp[2] ) ) ); \ - m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \ - m512_const1_64( tp[3] ) ) ); \ - m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \ - m512_const1_64( tp[4] ) ) ); \ - m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \ - m512_const1_64( tp[5] ) ) ); \ - m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \ - m512_const1_64( tp[6] ) ) ); \ - m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \ - m512_const1_64( tp[7] ) ) ); \ + m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \ + m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \ + m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \ + m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \ + m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \ + m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \ + m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \ + m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \ tp += 8; \ db = _mm512_srli_epi64( db, 1 ); \ } \ @@ -585,20 +577,13 @@ do { \ do { \ __m512i t; \ t = a; \ - a = _mm512_and_si512( a, c ); \ - a = _mm512_xor_si512( a, d ); \ - c = _mm512_xor_si512( c, b ); \ - c = _mm512_xor_si512( c, a ); \ - d = _mm512_or_si512( d, t ); \ - d = _mm512_xor_si512( d, b ); \ + a = mm512_xorand( d, a, c ); \ + c = mm512_xor3( a, b, c ); \ + b = mm512_xoror( b, d, t ); \ t = _mm512_xor_si512( t, c ); \ - b = d; \ - d = _mm512_or_si512( d, t ); \ - d = _mm512_xor_si512( d, a ); \ - a = _mm512_and_si512( a, b ); \ - t = _mm512_xor_si512( t, a ); \ - b = _mm512_xor_si512( b, d ); \ - b = _mm512_xor_si512( b, t ); \ + d = mm512_xoror( a, b, t ); \ + t = mm512_xorand( t, a, b ); \ + b = mm512_xor3( b, d, t ); \ a = c; \ c = b; \ b = d; \ @@ -609,14 +594,12 @@ do { \ do { \ a = mm512_rol_32( a, 13 ); \ c = mm512_rol_32( c, 3 ); \ - b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \ - d = _mm512_xor_si512( d, _mm512_xor_si512( c, \ - _mm512_slli_epi32( a, 3 ) ) ); \ + b = mm512_xor3( a, b, c ); \ + d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \ b = mm512_rol_32( b, 1 ); \ d = mm512_rol_32( d, 7 ); \ - a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \ - c = _mm512_xor_si512( c, _mm512_xor_si512( d, \ - _mm512_slli_epi32( b, 7 ) ) ); \ + a = mm512_xor3( a, b, d ); \ + c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \ a = mm512_rol_32( a, 5 ); \ c = mm512_rol_32( c, 22 ); \ } while (0) diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index 6b45e10..20c9755 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -522,50 +522,53 @@ do { \ // Haval-256 8 way 32 bit avx2 +#if defined (__AVX512VL__) + +// ( ~( a ^ b ) ) & c +#define mm256_andnotxor( a, b, c ) \ + _mm256_ternarylogic_epi32( a, b, c, 0x82 ) + +#else + +#define mm256_andnotxor( a, b, c ) \ + _mm256_andnot_si256( _mm256_xor_si256( a, b ), c ) + +#endif + #define F1_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( x0, \ - _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \ - _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \ - _mm256_and_si256( 
x3, x6 ) ) ) ) \ + mm256_xor3( x0, mm256_andxor( x1, x0, x4 ), \ + _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \ + _mm256_and_si256( x3, x6 ) ) ) \ #define F2_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( \ - _mm256_and_si256( x2, \ - _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \ - _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \ - _mm256_xor_si256( x6, x0 ) ) ) ), \ - _mm256_xor_si256( \ - _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \ - _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \ + mm256_xor3( mm256_andxor( x2, _mm256_andnot_si256( x3, x1 ), \ + mm256_xor3( _mm256_and_si256( x4, x5 ), x6, x0 ) ), \ + mm256_andxor( x4, x1, x5 ), \ + mm256_xorand( x0, x3, x5 ) ) \ #define F3_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( \ - _mm256_and_si256( x3, \ - _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ - _mm256_xor_si256( x6, x0 ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \ - _mm256_and_si256( x2, x5 ) ), x0 ) ) + mm256_xor3( x0, \ + _mm256_and_si256( x3, \ + mm256_xor3( _mm256_and_si256( x1, x2 ), x6, x0 ) ), \ + _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \ + _mm256_and_si256( x2, x5 ) ) ) #define F4_8W(x6, x5, x4, x3, x2, x1, x0) \ - _mm256_xor_si256( \ - _mm256_xor_si256( \ - _mm256_and_si256( x3, \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ - _mm256_or_si256( x4, x6 ) ), x5 ) ), \ - _mm256_and_si256( x4, \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \ - _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \ - _mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) ) - + mm256_xor3( \ + mm256_andxor( x3, x5, \ + _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ + _mm256_or_si256( x4, x6 ) ) ), \ + _mm256_and_si256( x4, \ + mm256_xor3( x0, _mm256_andnot_si256( x2, x5 ), \ + _mm256_xor_si256( x1, x6 ) ) ), \ + mm256_xorand( x0, x2, x6 ) ) #define F5_8W(x6, x5, x4, x3, x2, x1, x0) \ _mm256_xor_si256( \ - _mm256_and_si256( x0, \ - mm256_not( _mm256_xor_si256( \ - _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \ - _mm256_and_si256( x2, x5 ) ), \ - _mm256_and_si256( x3, x6 ) ) ) + mm256_andnotxor( mm256_and3( x1, x2, x3 ), x5, x0 ), \ + mm256_xor3( _mm256_and_si256( x1, x4 ), \ + _mm256_and_si256( x2, x5 ), \ + _mm256_and_si256( x3, x6 ) ) ) #define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \ F1_8W(x1, x0, x3, x5, x6, x2, x4) diff --git a/algo/jh/jh-hash-4way.c b/algo/jh/jh-hash-4way.c index 452bc8a..98a9da0 100644 --- a/algo/jh/jh-hash-4way.c +++ b/algo/jh/jh-hash-4way.c @@ -51,15 +51,15 @@ extern "C"{ do { \ __m512i cc = _mm512_set1_epi64( c ); \ x3 = mm512_not( x3 ); \ - x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \ - tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \ - x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \ - x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \ - x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \ - x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \ - x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \ - x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \ - x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \ + x0 = mm512_xorandnot( x0, x2, cc ); \ + tmp = mm512_xorand( cc, x0, x1 ); \ + x0 = mm512_xorand( x0, x2, x3 ); \ + x3 = mm512_xorandnot( x3, x1, x2 ); \ + x1 = mm512_xorand( x1, x0, x2 ); \ + x2 = mm512_xorandnot( x2, x3, x0 ); \ + x0 = mm512_xoror( x0, x1, x3 ); \ + x3 = mm512_xorand( x3, x1, 
x2 ); \ + x1 = mm512_xorand( x1, tmp, x0 ); \ x2 = _mm512_xor_si512( x2, tmp ); \ } while (0) @@ -67,11 +67,11 @@ do { \ do { \ x4 = _mm512_xor_si512( x4, x1 ); \ x5 = _mm512_xor_si512( x5, x2 ); \ - x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \ + x6 = mm512_xor3( x6, x3, x0 ); \ x7 = _mm512_xor_si512( x7, x0 ); \ x0 = _mm512_xor_si512( x0, x5 ); \ x1 = _mm512_xor_si512( x1, x6 ); \ - x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \ + x2 = mm512_xor3( x2, x7, x4 ); \ x3 = _mm512_xor_si512( x3, x4 ); \ } while (0) @@ -318,12 +318,12 @@ static const sph_u64 C[] = { #define Wz_8W(x, c, n) \ do { \ __m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \ - x ## h = _mm512_or_si512( _mm512_and_si512( \ - _mm512_srli_epi64(x ## h, (n)), (c)), t ); \ + x ## h = mm512_orand( t, _mm512_srli_epi64( x ## h, (n) ), (c) ); \ t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \ - x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \ + x ## l = mm512_orand( t, (x ## l >> (n)), (c) ); \ } while (0) + #define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 ) #define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 ) #define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 ) diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index cc88332..e2545b4 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -76,6 +76,9 @@ static const uint64_t RC[] = { #define OR64(d, a, b) (d = _mm512_or_si512(a,b)) #define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) #define ROL64(d, v, n) (d = mm512_rol_64(v, n)) +#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) +#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c)) + #include "keccak-macros.c" @@ -238,6 +241,8 @@ keccak512_8way_close(void *cc, void *dst) #undef NOT64 #undef ROL64 #undef KECCAK_F_1600 +#undef XOROR +#undef XORAND #endif // AVX512 @@ -255,6 +260,8 @@ keccak512_8way_close(void *cc, void *dst) #define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define ROL64(d, v, n) (d = mm256_rol_64(v, n)) +#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c))) +#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c))) #include "keccak-macros.c" @@ -419,5 +426,7 @@ keccak512_4way_close(void *cc, void *dst) #undef NOT64 #undef ROL64 #undef KECCAK_F_1600 +#undef XOROR +#undef XORAND #endif // AVX2 diff --git a/algo/keccak/keccak-macros.c b/algo/keccak/keccak-macros.c index 8d5197c..436d1ca 100644 --- a/algo/keccak/keccak-macros.c +++ b/algo/keccak/keccak-macros.c @@ -110,20 +110,34 @@ #ifdef KHI_XO #undef KHI_XO #endif + +#define KHI_XO(d, a, b, c) do { \ + XOROR(d, a, b, c); \ + } while (0) + +/* #define KHI_XO(d, a, b, c) do { \ DECL64(kt); \ OR64(kt, b, c); \ XOR64(d, a, kt); \ } while (0) +*/ #ifdef KHI_XA #undef KHI_XA #endif + +#define KHI_XA(d, a, b, c) do { \ + XORAND(d, a, b, c); \ + } while (0) + +/* #define KHI_XA(d, a, b, c) do { \ DECL64(kt); \ AND64(kt, b, c); \ XOR64(d, a, kt); \ } while (0) +*/ #ifdef KHI #undef KHI diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index bbc31b9..3d1ce0d 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -97,6 +97,21 @@ do { \ MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\ ADD_CONSTANT4W(*x, *(x+4), c0, c1); +#define SUBCRUMB4W(a0,a1,a2,a3,t)\ + t = a0;\ + a0 = mm512_xoror( a3, a0, a1 ); \ + a2 = _mm512_xor_si512(a2,a3);\ + a1 = 
_mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ + a3 = mm512_xorand( a2, a3, t ); \ + a2 = mm512_xorand( a1, a2, a0);\ + a1 = _mm512_or_si512(a1,a3);\ + a3 = _mm512_xor_si512(a3,a2);\ + t = _mm512_xor_si512(t,a1);\ + a2 = _mm512_and_si512(a2,a1);\ + a1 = mm512_xnor(a1,a0);\ + a0 = t; + +/* #define SUBCRUMB4W(a0,a1,a2,a3,t)\ t = _mm512_load_si512(&a0);\ a0 = _mm512_or_si512(a0,a1);\ @@ -115,7 +130,25 @@ do { \ a2 = _mm512_and_si512(a2,a1);\ a1 = _mm512_xor_si512(a1,a0);\ a0 = _mm512_load_si512(&t); +*/ +#define MIXWORD4W(a,b,t1,t2)\ + b = _mm512_xor_si512(a,b);\ + t1 = _mm512_slli_epi32(a,2);\ + t2 = _mm512_srli_epi32(a,30);\ + a = mm512_xoror( b, t1, t2 ); \ + t1 = _mm512_slli_epi32(b,14);\ + t2 = _mm512_srli_epi32(b,18);\ + b = _mm512_or_si512(t1,t2);\ + b = mm512_xoror( a, t1, t2 ); \ + t1 = _mm512_slli_epi32(a,10);\ + t2 = _mm512_srli_epi32(a,22);\ + a = mm512_xoror( b, t1, t2 ); \ + t1 = _mm512_slli_epi32(b,1);\ + t2 = _mm512_srli_epi32(b,31);\ + b = _mm512_or_si512(t1,t2); + +/* #define MIXWORD4W(a,b,t1,t2)\ b = _mm512_xor_si512(a,b);\ t1 = _mm512_slli_epi32(a,2);\ @@ -133,6 +166,7 @@ do { \ t1 = _mm512_slli_epi32(b,1);\ t2 = _mm512_srli_epi32(b,31);\ b = _mm512_or_si512(t1,t2); +*/ #define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\ a1 = _mm512_shuffle_epi32(a1,147);\ @@ -248,17 +282,10 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg ) __m512i tmp[2]; __m512i x[8]; - t0 = chainv[0]; - t1 = chainv[1]; - - t0 = _mm512_xor_si512( t0, chainv[2] ); - t1 = _mm512_xor_si512( t1, chainv[3] ); - t0 = _mm512_xor_si512( t0, chainv[4] ); - t1 = _mm512_xor_si512( t1, chainv[5] ); - t0 = _mm512_xor_si512( t0, chainv[6] ); - t1 = _mm512_xor_si512( t1, chainv[7] ); - t0 = _mm512_xor_si512( t0, chainv[8] ); - t1 = _mm512_xor_si512( t1, chainv[9] ); + t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] ); + t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] ); + t0 = mm512_xor3( t0, chainv[6], chainv[8] ); + t1 = mm512_xor3( t1, chainv[7], chainv[9] ); MULT24W( t0, t1 ); @@ -319,8 +346,8 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg ) chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] ); MULT24W( chainv[0], chainv[1] ); - chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 ); - chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 ); + chainv[0] = mm512_xor3( chainv[0], t0, msg0 ); + chainv[1] = mm512_xor3( chainv[1], t1, msg1 ); MULT24W( msg0, msg1 ); chainv[2] = _mm512_xor_si512( chainv[2], msg0 ); @@ -398,19 +425,11 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b ) /*---- blank round with m=0 ----*/ rnd512_4way( state, zero ); - - t[0] = chainv[0]; - t[1] = chainv[1]; - - t[0] = _mm512_xor_si512( t[0], chainv[2] ); - t[1] = _mm512_xor_si512( t[1], chainv[3] ); - t[0] = _mm512_xor_si512( t[0], chainv[4] ); - t[1] = _mm512_xor_si512( t[1], chainv[5] ); - t[0] = _mm512_xor_si512( t[0], chainv[6] ); - t[1] = _mm512_xor_si512( t[1], chainv[7] ); - t[0] = _mm512_xor_si512( t[0], chainv[8] ); - t[1] = _mm512_xor_si512( t[1], chainv[9] ); - + + t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] ); + t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] ); + t[0] = mm512_xor3( t[0], chainv[6], chainv[8] ); + t[1] = mm512_xor3( t[1], chainv[7], chainv[9] ); t[0] = _mm512_shuffle_epi32( t[0], 27 ); t[1] = _mm512_shuffle_epi32( t[1], 27 ); @@ -676,8 +695,6 @@ do { \ a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \ } while(0) -// confirm pointer arithmetic -// ok but use array indexes 
#define STEP_PART(x,c0,c1,t)\ SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\ SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\ @@ -688,23 +705,23 @@ do { \ ADD_CONSTANT(*x, *(x+4), c0, c1); #define SUBCRUMB(a0,a1,a2,a3,t)\ - t = _mm256_load_si256(&a0);\ + t = a0;\ a0 = _mm256_or_si256(a0,a1);\ a2 = _mm256_xor_si256(a2,a3);\ - a1 = _mm256_andnot_si256(a1, m256_neg1 );\ + a1 = mm256_not( a1 );\ a0 = _mm256_xor_si256(a0,a3);\ a3 = _mm256_and_si256(a3,t);\ a1 = _mm256_xor_si256(a1,a3);\ a3 = _mm256_xor_si256(a3,a2);\ a2 = _mm256_and_si256(a2,a0);\ - a0 = _mm256_andnot_si256(a0, m256_neg1 );\ + a0 = mm256_not( a0 );\ a2 = _mm256_xor_si256(a2,a1);\ a1 = _mm256_or_si256(a1,a3);\ t = _mm256_xor_si256(t,a1);\ a3 = _mm256_xor_si256(a3,a2);\ a2 = _mm256_and_si256(a2,a1);\ a1 = _mm256_xor_si256(a1,a0);\ - a0 = _mm256_load_si256(&t);\ + a0 = t;\ #define MIXWORD(a,b,t1,t2)\ b = _mm256_xor_si256(a,b);\ diff --git a/algo/panama/panama-hash-4way.c b/algo/panama/panama-hash-4way.c index d0bc186..912fb2e 100644 --- a/algo/panama/panama-hash-4way.c +++ b/algo/panama/panama-hash-4way.c @@ -312,10 +312,26 @@ do { \ BUPDATE1_8W( 7, 1 ); \ } while (0) +#if defined(__AVX512VL__) + +#define GAMMA_8W(n0, n1, n2, n4) \ + ( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) ) + +#define THETA_8W(n0, n1, n2, n4) \ + ( g ## n0 = mm256_xor3( a ## n0, a ## n1, a ## n4 ) ) + +#else + #define GAMMA_8W(n0, n1, n2, n4) \ (g ## n0 = _mm256_xor_si256( a ## n0, \ _mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) ) +#define THETA_8W(n0, n1, n2, n4) \ + ( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \ + a ## n4 ) ) ) + +#endif + #define PI_ALL_8W do { \ a0 = g0; \ a1 = mm256_rol_32( g7, 1 ); \ @@ -336,9 +352,6 @@ do { \ a16 = mm256_rol_32( g10, 8 ); \ } while (0) -#define THETA_8W(n0, n1, n2, n4) \ - ( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \ - a ## n4 ) ) ) #define SIGMA_ALL_8W do { \ a0 = _mm256_xor_si256( g0, m256_one_32 ); \ diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 3a0c61b..f9505d1 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -59,6 +59,8 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); void sha256_4way_full( void *dst, const void *data, size_t len ); +void sha256_4way_transform( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); #endif // SSE2 @@ -77,6 +79,8 @@ void sha256_8way_init( sha256_8way_context *sc ); void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); void sha256_8way_close( sha256_8way_context *sc, void *dst ); void sha256_8way_full( void *dst, const void *data, size_t len ); +void sha256_8way_transform( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); #endif // AVX2 @@ -95,6 +99,12 @@ void sha256_16way_init( sha256_16way_context *sc ); void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ); void sha256_16way_close( sha256_16way_context *sc, void *dst ); void sha256_16way_full( void *dst, const void *data, size_t len ); +void sha256_16way_transform( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, + const __m512i *state_in ); +void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, + const __m512i *state_in, const __m512i *state_mid ); #endif // AVX512 diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 
33cc6c1..7eb4067 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -195,8 +195,28 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) hash[i] = swab32(hash[i]); } -extern void sha256d(unsigned char *hash, const unsigned char *data, int len) +#if defined (__SHA__) + +#include "algo/sha/sph_sha2.h" + +void sha256d(unsigned char *hash, const unsigned char *data, int len) { + sph_sha256_context ctx __attribute__ ((aligned (64))); + + sph_sha256_init( &ctx ); + sph_sha256( &ctx, data, len ); + sph_sha256_close( &ctx, hash ); + + sph_sha256_init( &ctx ); + sph_sha256( &ctx, hash, 32 ); + sph_sha256_close( &ctx, hash ); +} + +#else + +void sha256d(unsigned char *hash, const unsigned char *data, int len) +{ + uint32_t S[16], T[16]; int i, r; @@ -220,6 +240,8 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len) be32enc((uint32_t *)hash + i, T[i]); } +#endif + static inline void sha256d_preextend(uint32_t *W) { W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; diff --git a/algo/sha/sha256-hash-2way-ni.c b/algo/sha/sha256-hash-2way-ni.c new file mode 100644 index 0000000..f169b63 --- /dev/null +++ b/algo/sha/sha256-hash-2way-ni.c @@ -0,0 +1,345 @@ +/* Intel SHA extensions using C intrinsics */ +/* Written and place in public domain by Jeffrey Walton */ +/* Based on code from Intel, and by Sean Gulley for */ +/* the miTLS project. */ + +// A stripped down version with byte swapping removed. + +#if defined(__SHA__) + +#include "sha256-hash-opt.h" + +void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + MSG_X = 
_mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 
4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, 
TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 + TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, 
STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + +#endif diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index a1f657e..c5f6048 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -79,10 +79,15 @@ static const uint32_t K256[64] = _mm_or_si128( _mm_and_si128( X, Y ), \ _mm_and_si128( _mm_or_si128( X, Y ), Z ) ) */ - +/* #define MAJs(X, Y, Z) \ _mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \ _mm_xor_si128( Y, Z ) ) ) +*/ + +#define MAJs(X, Y, Z) \ + _mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \ + Y_xor_Z ) ) #define BSG2_0(x) \ _mm_xor_si128( _mm_xor_si128( \ @@ -100,6 +105,7 @@ static const uint32_t K256[64] = _mm_xor_si128( _mm_xor_si128( \ mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) +/* #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { 
\ __m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \ @@ -128,9 +134,9 @@ do { \ H = _mm_add_epi32( T1, T2 ); \ D = _mm_add_epi32( D, T1 ); \ } while (0) +*/ -/* #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m128i T1, T2; \ @@ -138,16 +144,98 @@ do { \ T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \ K, W[i] ) ); \ T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm_add_epi32( D, T1 ); \ H = _mm_add_epi32( T1, T2 ); \ } while (0) -*/ +void sha256_4way_transform( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + __m128i W[16]; + + memcpy_128( W, data, 16 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + Y_xor_Z = _mm_xor_si128( B, C ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm_add_epi32( state_in[0], A ); + state_out[1] = _mm_add_epi32( state_in[1], B ); + state_out[2] = _mm_add_epi32( state_in[2], C ); + state_out[3] = _mm_add_epi32( state_in[3], D ); + state_out[4] = _mm_add_epi32( state_in[4], E ); + state_out[5] = _mm_add_epi32( state_in[5], F ); + state_out[6] = _mm_add_epi32( state_in[6], G ); + state_out[7] 
= _mm_add_epi32( state_in[7], H ); +} + static void sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) { - register __m128i A, B, C, D, E, F, G, H; + register __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m128i W[16]; mm128_block_bswap_32( W, in ); @@ -176,6 +264,8 @@ sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) H = m128_const1_64( 0x5BE0CD195BE0CD19 ); } + Y_xor_Z = _mm_xor_si128( B, C ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -327,10 +417,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) high = (sc->count_high << 3) | (low >> 29); low = low << 3; - sc->buf[ pad >> 2 ] = - mm128_bswap_32( m128_const1_32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm128_bswap_32( m128_const1_32( low ) ); + sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) ); + sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) ); sha256_4way_round( sc, sc->buf, sc->val ); mm128_block_bswap_32( dst, sc->val ); @@ -348,23 +436,39 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) // SHA-256 8 way +#if defined(__AVX512VL__) + +#define CHx(X, Y, Z) \ + _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) + +#define MAJx(X, Y, Z) \ + _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 ) + +#define BSG2_0x(x) \ + mm256_xor3( mm256_ror_32(x, 2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) ) + +#define BSG2_1x(x) \ + mm256_xor3( mm256_ror_32(x, 6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) ) + +#define SSG2_0x(x) \ + mm256_xor3( mm256_ror_32(x, 7), mm256_ror_32(x, 18), _mm256_srli_epi32(x, 3) ) + +#define SSG2_1x(x) \ + mm256_xor3( mm256_ror_32(x, 17), mm256_ror_32(x, 19), _mm256_srli_epi32(x, 10) ) + +#else // AVX2 + #define CHx(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) -/* -#define MAJx(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) -*/ -/* #define MAJx(X, Y, Z) \ _mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \ _mm256_xor_si256( Y, Z ) ) ) -*/ - +/* #define MAJx(X, Y, Z) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) +*/ #define BSG2_0x(x) \ _mm256_xor_si256( _mm256_xor_si256( \ @@ -382,6 +486,8 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) _mm256_xor_si256( _mm256_xor_si256( \ mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) ) +#endif // AVX512 else AVX2 + #define SHA2x_MEXP( a, b, c, d ) \ mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); @@ -392,15 +498,95 @@ do { \ T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ K, W[i] ) ); \ T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ - Y_xor_Z = X_xor_Y; \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) -static void -sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) +void sha256_8way_transform( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) { - register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + __m256i A, B, C, D, E, F, G, H; + __m256i W[16]; + + memcpy_256( W, data, 16 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 
); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm256_add_epi32( state_in[0], A ); + state_out[1] = _mm256_add_epi32( state_in[1], B ); + state_out[2] = _mm256_add_epi32( state_in[2], C ); + state_out[3] = _mm256_add_epi32( state_in[3], D ); + state_out[4] = _mm256_add_epi32( state_in[4], E ); + state_out[5] = _mm256_add_epi32( state_in[5], F ); + state_out[6] = _mm256_add_epi32( state_in[6], G ); + state_out[7] = _mm256_add_epi32( state_in[7], H ); +} + +static void +sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) +{ + register __m256i A, B, C, D, E, F, G, H; __m256i W[16]; mm256_block_bswap_32( W , in ); @@ -429,8 +615,6 @@ sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) H = m256_const1_64( 0x5BE0CD195BE0CD19 ); } - Y_xor_Z = _mm256_xor_si256( B, C ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -586,10 +770,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) high = (sc->count_high << 3) | (low >> 29); low = low << 3; - sc->buf[ pad >> 2 ] = - mm256_bswap_32( m256_const1_32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm256_bswap_32( m256_const1_32( low ) ); + sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) ); + sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) ); sha256_8way_round( sc, sc->buf, sc->val ); @@ -609,38 +791,22 @@ 
void sha256_8way_full( void *dst, const void *data, size_t len ) // SHA-256 16 way #define CHx16(X, Y, Z) \ - _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) - -/* -#define MAJx16(X, Y, Z) \ - _mm512_or_si512( _mm512_and_si512( X, Y ), \ - _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) -*/ -/* -#define MAJx16(X, Y, Z) \ - _mm512_xor_si512( Y, _mm512_and_si512( _mm512_xor_si512( X, Y ), \ - _mm512_xor_si512( Y, Z ) ) ) -*/ + _mm512_ternarylogic_epi32( X, Y, Z, 0xca ) #define MAJx16(X, Y, Z) \ - _mm512_xor_si512( Y, _mm512_and_si512( X_xor_Y = _mm512_xor_si512( X, Y ), \ - Y_xor_Z ) ) + _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 ) #define BSG2_0x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) ) + mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) ) #define BSG2_1x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) ) + mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) ) #define SSG2_0x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) ) + mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) ) #define SSG2_1x16(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) ) + mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) ) #define SHA2x16_MEXP( a, b, c, d ) \ mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] ); @@ -652,15 +818,220 @@ do { \ T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \ K, W[i] ) ); \ T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ - Y_xor_Z = X_xor_Y; \ D = _mm512_add_epi32( D, T1 ); \ H = _mm512_add_epi32( T1, T2 ); \ } while (0) +// Transform one 16 lane by 64 byte message block and update the state. +// The calling function is responsible for initializing the state, setting +// the correct byte order, counting bits and padding the final block. +// This is faster for multiple rounds of sha256 (sha256d/t/q) because it +// eliminates redundant byte swapping.
+// +void sha256_16way_transform( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; + + memcpy_512( W, data, 16 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm512_add_epi32( state_in[0], A ); + state_out[1] = _mm512_add_epi32( state_in[1], B ); + state_out[2] = _mm512_add_epi32( state_in[2], C ); + state_out[3] = _mm512_add_epi32( state_in[3], D ); + state_out[4] = _mm512_add_epi32( state_in[4], E ); + state_out[5] = _mm512_add_epi32( state_in[5], F ); + state_out[6] = _mm512_add_epi32( state_in[6], G ); + state_out[7] = _mm512_add_epi32( state_in[7], H ); +} + +// Aggressive prehashing +void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, + const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + + A = _mm512_load_si512( state_in ); + B = _mm512_load_si512( state_in + 1 ); + C = _mm512_load_si512( state_in + 2 ); + D = _mm512_load_si512( state_in + 3 ); + E = _mm512_load_si512( state_in + 4 ); + F = _mm512_load_si512( state_in + 5 ); + G = 
_mm512_load_si512( state_in + 6 ); + H = _mm512_load_si512( state_in + 7 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + + _mm512_store_si512( state_mid , A ); + _mm512_store_si512( state_mid + 1, B ); + _mm512_store_si512( state_mid + 2, C ); + _mm512_store_si512( state_mid + 3, D ); + _mm512_store_si512( state_mid + 4, E ); + _mm512_store_si512( state_mid + 5, F ); + _mm512_store_si512( state_mid + 6, G ); + _mm512_store_si512( state_mid + 7, H ); +} + +void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, + const __m512i *state_in, const __m512i *state_mid ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; + + memcpy_512( W, data, 16 ); + + A = _mm512_load_si512( state_mid ); + B = _mm512_load_si512( state_mid + 1 ); + C = _mm512_load_si512( state_mid + 2 ); + D = _mm512_load_si512( state_mid + 3 ); + E = _mm512_load_si512( state_mid + 4 ); + F = _mm512_load_si512( state_mid + 5 ); + G = _mm512_load_si512( state_mid + 6 ); + H = _mm512_load_si512( state_mid + 7 ); + +// SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); +// SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); +// SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + A = _mm512_add_epi32( A, _mm512_load_si512( state_in ) ); + B = 
_mm512_add_epi32( B, _mm512_load_si512( state_in + 1 ) ); + C = _mm512_add_epi32( C, _mm512_load_si512( state_in + 2 ) ); + D = _mm512_add_epi32( D, _mm512_load_si512( state_in + 3 ) ); + E = _mm512_add_epi32( E, _mm512_load_si512( state_in + 4 ) ); + F = _mm512_add_epi32( F, _mm512_load_si512( state_in + 5 ) ); + G = _mm512_add_epi32( G, _mm512_load_si512( state_in + 6 ) ); + H = _mm512_add_epi32( H, _mm512_load_si512( state_in + 7 ) ); + + _mm512_store_si512( state_out , A ); + _mm512_store_si512( state_out + 1, B ); + _mm512_store_si512( state_out + 2, C ); + _mm512_store_si512( state_out + 3, D ); + _mm512_store_si512( state_out + 4, E ); + _mm512_store_si512( state_out + 5, F ); + _mm512_store_si512( state_out + 6, G ); + _mm512_store_si512( state_out + 7, H ); +} + static void sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) { - register __m512i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + register __m512i A, B, C, D, E, F, G, H; __m512i W[16]; mm512_block_bswap_32( W , in ); @@ -689,7 +1060,6 @@ sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) H = m512_const1_64( 0x5BE0CD195BE0CD19 ); } - Y_xor_Z = _mm512_xor_si512( B, C ); SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); @@ -834,10 +1204,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) high = (sc->count_high << 3) | (low >> 29); low = low << 3; - sc->buf[ pad >> 2 ] = - mm512_bswap_32( m512_const1_32( high ) ); - sc->buf[ ( pad+4 ) >> 2 ] = - mm512_bswap_32( m512_const1_32( low ) ); + sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) ); + sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) ); sha256_16way_round( sc, sc->buf, sc->val ); diff --git a/algo/sha/sha256-hash-opt.c b/algo/sha/sha256-hash-opt.c index fb049b1..78bda65 100644 --- a/algo/sha/sha256-hash-opt.c +++ b/algo/sha/sha256-hash-opt.c @@ -3,23 +3,24 @@ /* Based on code from Intel, and by Sean Gulley for */ /* the miTLS project. */ -// A drop in replacement for the function of the same name in sph_sha2.c. +// A stripped down version with byte swapping removed. 
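Since the byte swapping shuffle is gone, the caller is now responsible for presenting the 64 byte block with its words already in the order the SHA rounds expect, and for passing explicit input and output state pointers. A minimal sketch of the intended call pattern, modelled on the scanhash code later in this patch (the function and buffer names are illustrative, not part of the patch, and all buffers are assumed 16 byte aligned):

   #include <stdint.h>
   #include <string.h>
   #include "sha256-hash-opt.h"

   // First SHA-256 block of an 80 byte block header, then the final 16 bytes
   // plus padding, reusing a saved midstate. No byte swapping is done inside
   // sha256_opt_transform(), so the padding and length words are written
   // directly. The result in hash[] is the raw state; a final byte swap is
   // still needed before comparing against a target.
   static void sha256_80_sketch( uint32_t hash[8], const uint32_t pdata[20] )
   {
      static const uint32_t initstate[8] __attribute__ ((aligned (16))) =
         { 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
           0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
      uint32_t midstate[8] __attribute__ ((aligned (16)));
      uint32_t block[16]   __attribute__ ((aligned (16)));

      sha256_opt_transform( midstate, pdata, initstate );   // bytes 0..63
      memcpy( block, pdata + 16, 16 );                       // bytes 64..79
      block[ 4] = 0x80000000;                                // padding marker
      memset( block + 5, 0, 40 );
      block[15] = 80*8;                                      // bit count
      sha256_opt_transform( hash, block, midstate );         // second block
   }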
#if defined(__SHA__) -#include "simd-utils.h" +#include "sha256-hash-opt.h" -static void sha2_round( const uint8_t input[], uint32_t state[8] ) +void sha256_opt_transform( uint32_t *state_out, const void *input, + const uint32_t *state_in ) { __m128i STATE0, STATE1; - __m128i MSG, TMP, MASK; + __m128i MSG, TMP; __m128i TMSG0, TMSG1, TMSG2, TMSG3; __m128i ABEF_SAVE, CDGH_SAVE; // Load initial values - TMP = _mm_load_si128((__m128i*) &state[0]); - STATE1 = _mm_load_si128((__m128i*) &state[4]); - MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); +// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH @@ -31,8 +32,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) CDGH_SAVE = STATE1; // Rounds 0-3 - MSG = _mm_load_si128((const __m128i*) (input+0)); - TMSG0 = _mm_shuffle_epi8(MSG, MASK); + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); +// TMSG0 = _mm_shuffle_epi8(MSG, MASK); MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); MSG = _mm_shuffle_epi32(MSG, 0x0E); @@ -40,7 +41,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) // Rounds 4-7 TMSG1 = _mm_load_si128((const __m128i*) (input+16)); - TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); +// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); MSG = _mm_shuffle_epi32(MSG, 0x0E); @@ -49,7 +50,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) // Rounds 8-11 TMSG2 = _mm_load_si128((const __m128i*) (input+32)); - TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); +// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); MSG = _mm_shuffle_epi32(MSG, 0x0E); @@ -58,7 +59,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) // Rounds 12-15 TMSG3 = _mm_load_si128((const __m128i*) (input+48)); - TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); +// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); @@ -192,9 +193,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] ) STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF // Save state - _mm_store_si128((__m128i*) &state[0], STATE0); - _mm_store_si128((__m128i*) &state[4], STATE1); + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); } - #endif diff --git a/algo/sha/sha256-hash-opt.h b/algo/sha/sha256-hash-opt.h new file mode 100644 index 0000000..9ceacf4 --- /dev/null +++ b/algo/sha/sha256-hash-opt.h @@ -0,0 +1,18 @@ +#ifndef SHA2_HASH_OPT_H__ +#define SHA2_HASH_OPT_H__ 1 + +#include +#include "simd-utils.h" + +#if defined(__SHA__) + +void sha256_opt_transform( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +// 2 way with interleaved instructions +void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +#endif +#endif diff --git 
a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c new file mode 100644 index 0000000..9bbc5c8 --- /dev/null +++ b/algo/sha/sha256d-4way.c @@ -0,0 +1,252 @@ +#include "sha256t-gate.h" +#include +#include +#include +#include +#include "sha-hash-4way.h" + +#if defined(SHA256D_16WAY) + +int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m512i block[16] __attribute__ ((aligned (64))); + __m512i hash32[8] __attribute__ ((aligned (32))); + __m512i initstate[8] __attribute__ ((aligned (32))); + __m512i midstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + __m512i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 16; + uint32_t n = first_nonce; + __m512i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m512i last_byte = m512_const1_32( 0x80000000 ); + const __m512i sixteen = m512_const1_32( 16 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m512_const1_32( pdata[i] ); + + *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, + n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + + // initialize state + initstate[0] = m512_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m512_const1_64( 0x510E527F510E527F ); + initstate[5] = m512_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_16way_transform( midstate, vdata, initstate ); + + do + { + // 1. final 16 bytes of data, with padding + memcpy_512( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_512( block + 5, 10 ); + block[15] = m512_const1_32( 80*8 ); // bit count + sha256_16way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. 
+ memcpy_512( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_512( block + 9, 6 ); + block[15] = m512_const1_32( 32*8 ); // bit count + sha256_16way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm512_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm512_add_epi32( *noncev, sixteen ); + n += 16; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + + +#endif + +#if defined(SHA256D_8WAY) + +int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m256i block[16] __attribute__ ((aligned (64))); + __m256i hash32[8] __attribute__ ((aligned (32))); + __m256i initstate[8] __attribute__ ((aligned (32))); + __m256i midstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + __m256i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + __m256i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m256i last_byte = m256_const1_32( 0x80000000 ); + const __m256i eight = m256_const1_32( 8 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m256_const1_32( pdata[i] ); + + *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + + // initialize state + initstate[0] = m256_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m256_const1_64( 0x510E527F510E527F ); + initstate[5] = m256_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_8way_transform( midstate, vdata, initstate ); + + do + { + // 1. final 16 bytes of data, with padding + memcpy_256( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_256( block + 5, 10 ); + block[15] = m256_const1_32( 80*8 ); // bit count + sha256_8way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. 
+ memcpy_256( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_256( block + 9, 6 ); + block[15] = m256_const1_32( 32*8 ); // bit count + sha256_8way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm256_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm256_add_epi32( *noncev, eight ); + n += 8; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +#if defined(SHA256D_4WAY) + +int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m128i block[16] __attribute__ ((aligned (64))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + __m128i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + uint32_t n = first_nonce; + __m128i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i last_byte = m128_const1_32( 0x80000000 ); + const __m128i four = m128_const1_32( 4 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m128_const1_32( pdata[i] ); + + *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + + // initialize state + initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m128_const1_64( 0x510E527F510E527F ); + initstate[5] = m128_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_4way_transform( midstate, vdata, initstate ); + + do + { + // 1. final 16 bytes of data, with padding + memcpy_128( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_128( block + 5, 10 ); + block[15] = m128_const1_32( 80*8 ); // bit count + sha256_4way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. 
+ memcpy_128( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + sha256_4way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm128_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index eb11744..0f4fb58 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -7,133 +7,173 @@ #if defined(SHA256T_16WAY) -static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64))); - -void sha256t_16way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*16] __attribute__ ((aligned (64))); - sha256_16way_context ctx; - memcpy( &ctx, &sha256_ctx16, sizeof ctx ); - - sha256_16way_update( &ctx, input + (64<<4), 16 ); - sha256_16way_close( &ctx, vhash ); - - sha256_16way_init( &ctx ); - sha256_16way_update( &ctx, vhash, 32 ); - sha256_16way_close( &ctx, vhash ); - - sha256_16way_init( &ctx ); - sha256_16way_update( &ctx, vhash, 32 ); - sha256_16way_close( &ctx, output ); -} - int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*16] __attribute__ ((aligned (64))); - uint32_t hash32[8*16] __attribute__ ((aligned (32))); + __m512i block[16] __attribute__ ((aligned (64))); + __m512i hash32[8] __attribute__ ((aligned (32))); + __m512i initstate[8] __attribute__ ((aligned (32))); + __m512i midstate[8] __attribute__ ((aligned (32))); + __m512i midstate2[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash32_d7 = &(hash32[7<<4]); + __m512i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 16; uint32_t n = first_nonce; - __m512i *noncev = (__m512i*)vdata + 19; // aligned + __m512i *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; + const __m512i last_byte = m512_const1_32( 0x80000000 ); + const __m512i sixteen = m512_const1_32( 16 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m512_const1_32( pdata[i] ); - mm512_bswap32_intrlv80_16x32( vdata, pdata ); *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); - sha256_16way_init( &sha256_ctx16 ); - sha256_16way_update( &sha256_ctx16, vdata, 64 ); + + // initialize state + initstate[0] = m512_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m512_const1_64( 0x510E527F510E527F ); + initstate[5] = m512_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 
byte block of data + sha256_16way_transform( midstate, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); do { - pdata[19] = n; - sha256t_16way_hash( hash32, vdata ); - for ( int lane = 0; lane < 16; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) - { - extr_lane_16x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n + lane ); - submit_solution( work, lane_hash, mythr ); - } - } - *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) ); - n += 16; + // 1. final 16 bytes of data, with padding + memcpy_512( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_512( block + 5, 10 ); + block[15] = m512_const1_32( 80*8 ); // bit count + sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); +// sha256_16way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. + memcpy_512( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_512( block + 9, 6 ); + block[15] = m512_const1_32( 32*8 ); // bit count + sha256_16way_transform( hash32, block, initstate ); + + // 3. 32 byte hash from 2. + memcpy_512( block, hash32, 8 ); + sha256_16way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm512_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm512_add_epi32( *noncev, sixteen ); + n += 16; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; return 0; } + #endif #if defined(SHA256T_8WAY) -static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); - -void sha256t_8way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*8] __attribute__ ((aligned (64))); - sha256_8way_context ctx; - memcpy( &ctx, &sha256_ctx8, sizeof ctx ); - - sha256_8way_update( &ctx, input + (64<<3), 16 ); - sha256_8way_close( &ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way_update( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, vhash ); - - sha256_8way_init( &ctx ); - sha256_8way_update( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, output ); -} - int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t hash32[8*8] __attribute__ ((aligned (32))); + __m256i block[16] __attribute__ ((aligned (64))); + __m256i hash32[8] __attribute__ ((aligned (32))); + __m256i initstate[8] __attribute__ ((aligned (32))); + __m256i midstate[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash32_d7 = &(hash32[7<<3]); + __m256i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; - __m256i *noncev = (__m256i*)vdata + 19; // aligned + __m256i *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; + const __m256i last_byte = m256_const1_32( 0x80000000 ); + const 
__m256i eight = m256_const1_32( 8 ); - mm256_bswap32_intrlv80_8x32( vdata, pdata ); - *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); - sha256_8way_init( &sha256_ctx8 ); - sha256_8way_update( &sha256_ctx8, vdata, 64 ); + for ( int i = 0; i < 19; i++ ) + vdata[i] = m256_const1_32( pdata[i] ); + + *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + + // initialize state + initstate[0] = m256_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m256_const1_64( 0x510E527F510E527F ); + initstate[5] = m256_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_8way_transform( midstate, vdata, initstate ); do { - pdata[19] = n; - sha256t_8way_hash( hash32, vdata ); - for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) - { - extr_lane_8x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n + lane ); - submit_solution( work, lane_hash, mythr ); - } - } - *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) ); - n += 8; + // 1. final 16 bytes of data, with padding + memcpy_256( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_256( block + 5, 10 ); + block[15] = m256_const1_32( 80*8 ); // bit count + sha256_8way_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. + memcpy_256( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_256( block + 9, 6 ); + block[15] = m256_const1_32( 32*8 ); // bit count + sha256_8way_transform( hash32, block, initstate ); + + // 3. 32 byte hash from 2. 
+ memcpy_256( block, hash32, 8 ); + sha256_8way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm256_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm256_add_epi32( *noncev, eight ); + n += 8; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; @@ -144,82 +184,84 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, #if defined(SHA256T_4WAY) -static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64))); - -void sha256t_4way_hash( void* output, const void* input ) -{ - uint32_t vhash[8*4] __attribute__ ((aligned (64))); - sha256_4way_context ctx; - memcpy( &ctx, &sha256_ctx4, sizeof ctx ); - - sha256_4way_update( &ctx, input + (64<<2), 16 ); - sha256_4way_close( &ctx, vhash ); - - sha256_4way_init( &ctx ); - sha256_4way_update( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, vhash ); - - sha256_4way_init( &ctx ); - sha256_4way_update( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, output ); -} - int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[7<<2]); + __m128i block[16] __attribute__ ((aligned (64))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + __m128i vdata[20] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; + const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned + __m128i *noncev = vdata + 19; const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i last_byte = m128_const1_32( 0x80000000 ); + const __m128i four = m128_const1_32( 4 ); - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; + for ( int i = 0; i < 19; i++ ) + vdata[i] = m128_const1_32( pdata[i] ); - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - sha256_4way_init( &sha256_ctx4 ); - sha256_4way_update( &sha256_ctx4, vdata, 64 ); + *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + // initialize state + initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m128_const1_64( 0x510E527F510E527F ); + initstate[5] = m128_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); + + // 
hash first 64 bytes of data + sha256_4way_transform( midstate, vdata, initstate ); + + do { - const uint32_t mask = masks[m]; - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); - pdata[19] = n; + // 1. final 16 bytes of data, with padding + memcpy_128( block, vdata + 16, 4 ); + block[ 4] = last_byte; + memset_zero_128( block + 5, 10 ); + block[15] = m128_const1_32( 80*8 ); // bit count + sha256_4way_transform( hash32, block, midstate ); - sha256t_4way_hash( hash, vdata ); + // 2. 32 byte hash from 1. + memcpy_128( block, hash32, 8 ); + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + sha256_4way_transform( hash32, block, initstate ); - for ( int lane = 0; lane < 4; lane++ ) - if ( !( hash7[ lane ] & mask ) ) + // 3. 32 byte hash from 2. + memcpy_128( block, hash32, 8 ); + sha256_4way_transform( hash32, block, initstate ); + + // byte swap final hash for testing + mm128_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) { - extr_lane_4x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); - } - } - n += 4; - } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index 166efe2..e05c706 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -5,17 +5,13 @@ bool register_sha256t_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; #if defined(SHA256T_16WAY) gate->scanhash = (void*)&scanhash_sha256t_16way; - gate->hash = (void*)&sha256t_16way_hash; #elif defined(__SHA__) gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t; - gate->hash = (void*)&sha256t_hash; #elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256t_8way; - gate->hash = (void*)&sha256t_8way_hash; #else gate->scanhash = (void*)&scanhash_sha256t_4way; - gate->hash = (void*)&sha256t_4way_hash; #endif return true; } diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index 46266f2..e74cfd1 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -17,7 +17,6 @@ bool register_sha256q_algo( algo_gate_t* gate ); #if defined(SHA256T_16WAY) -void sha256t_16way_hash( void *output, const void *input ); int scanhash_sha256t_16way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_16way_hash( void *output, const void *input ); @@ -27,7 +26,6 @@ int scanhash_sha256q_16way( struct work *work, uint32_t max_nonce, #if defined(SHA256T_8WAY) -void sha256t_8way_hash( void *output, const void *input ); int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_8way_hash( void *output, const void *input ); @@ -37,7 +35,6 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce, #if defined(SHA256T_4WAY) -void sha256t_4way_hash( void *output, const void *input ); int 
scanhash_sha256t_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_4way_hash( void *output, const void *input ); @@ -45,10 +42,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif +#if defined(__SHA__) -int sha256t_hash( void *output, const void *input ); int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + int sha256q_hash( void *output, const void *input ); int scanhash_sha256q( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index bd4edf0..90d2754 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -3,10 +3,14 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +//#include "algo/sha/sph_sha2.h" +#include "sha256-hash-opt.h" + +#if defined(__SHA__) // Only used on CPUs with SHA +/* static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64))); void sha256t_midstate( const void* input ) @@ -37,12 +41,21 @@ int sha256t_hash( void* output, const void* input ) return 1; } +*/ +/* int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t edata[20] __attribute__((aligned(64))); - uint32_t hash[8] __attribute__((aligned(64))); + uint32_t block[16] __attribute__ ((aligned (64))); + uint32_t hash32[8] __attribute__ ((aligned (32))); + uint32_t initstate[8] __attribute__ ((aligned (32))); + uint32_t midstate[8] __attribute__ ((aligned (32))); + + + +// uint32_t edata[20] __attribute__((aligned(64))); +// uint32_t hash[8] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -50,24 +63,148 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id = mythr->id; const bool bench = opt_benchmark; + __m128i shuf_bswap32 = + _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - mm128_bswap32_80( edata, pdata ); - sha256t_midstate( edata ); +// mm128_bswap32_80( edata, pdata ); +// sha256t_midstate( edata ); + + // initialize state + initstate[0] = 0x6A09E667; + initstate[1] = 0xBB67AE85; + initstate[2] = 0x3C6EF372; + initstate[3] = 0xA54FF53A; + initstate[4] = 0x510E527F; + initstate[5] = 0x9B05688C; + initstate[6] = 0x1F83D9AB; + initstate[7] = 0x5BE0CD19; + + // hash first 64 bytes of data + sha256_opt_transform( midstate, pdata, initstate ); do { - edata[19] = n; - if ( likely( sha256t_hash( hash, edata ) ) ) - if ( unlikely( valid_hash( hash, ptarget ) && !bench ) ) - { - pdata[19] = bswap_32( n ); - submit_solution( work, hash, mythr ); - } + // 1. final 16 bytes of data, with padding + memcpy( block, pdata + 16, 16 ); + block[ 4] = 0x80000000; + memset( block + 5, 0, 40 ); + block[15] = 80*8; // bit count + sha256_opt_transform( hash32, block, midstate ); + + // 2. 32 byte hash from 1. + memcpy( block, hash32, 32 ); + block[ 8] = 0x80000000; + memset( block + 9, 0, 24 ); + block[15] = 32*8; // bit count + sha256_opt_transform( hash32, block, initstate ); + + // 3. 32 byte hash from 2. 
+ memcpy( block, hash32, 32 ); + sha256_opt_transform( hash32, block, initstate ); + + // byte swap final hash for testing + casti_m128i( hash32, 0 ) = + _mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 ); + casti_m128i( hash32, 1 ) = + _mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 ); + + if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) ) + submit_solution( work, hash32, mythr ); n++; - } while ( n < last_nonce && !work_restart[thr_id].restart ); + pdata[19] = n; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} +*/ + +int scanhash_sha256t( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block0[16] __attribute__ ((aligned (64))); + uint32_t block1[16] __attribute__ ((aligned (64))); + uint32_t hash0[8] __attribute__ ((aligned (32))); + uint32_t hash1[8] __attribute__ ((aligned (32))); + uint32_t initstate[8] __attribute__ ((aligned (32))); + uint32_t midstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 1; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + __m128i shuf_bswap32 = + _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // initialize state + initstate[0] = 0x6A09E667; + initstate[1] = 0xBB67AE85; + initstate[2] = 0x3C6EF372; + initstate[3] = 0xA54FF53A; + initstate[4] = 0x510E527F; + initstate[5] = 0x9B05688C; + initstate[6] = 0x1F83D9AB; + initstate[7] = 0x5BE0CD19; + + // hash first 64 bytes of data + sha256_opt_transform( midstate, pdata, initstate ); + + do + { + // 1. final 16 bytes of data, with padding + memcpy( block0, pdata + 16, 16 ); + memcpy( block1, pdata + 16, 16 ); + block0[ 3] = n; + block1[ 3] = n+1; + block0[ 4] = block1[ 4] = 0x80000000; + memset( block0 + 5, 0, 40 ); + memset( block1 + 5, 0, 40 ); + block0[15] = block1[15] = 80*8; // bit count + sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate ); + + // 2. 32 byte hash from 1. + memcpy( block0, hash0, 32 ); + memcpy( block1, hash1, 32 ); + block0[ 8] = block1[ 8] = 0x80000000; + memset( block0 + 9, 0, 24 ); + memset( block1 + 9, 0, 24 ); + block0[15] = block1[15] = 32*8; // bit count + sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate ); + + // 3. 32 byte hash from 2. 
+ memcpy( block0, hash0, 32 ); + memcpy( block1, hash1, 32 ); + sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate ); + + // byte swap final hash for testing + casti_m128i( hash0, 0 ) = + _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 ); + casti_m128i( hash0, 1 ) = + _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 ); + casti_m128i( hash1, 0 ) = + _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 ); + casti_m128i( hash1, 1 ) = + _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 ); + + if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hash0, mythr ); + } + if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hash1, mythr ); + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } +#endif diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 803c42f..e41a92b 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -96,86 +96,22 @@ static const uint64_t K512[80] = // SHA-512 8 way 64 bit #define CH8W(X, Y, Z) \ - _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) - -/* -#define MAJ8W(X, Y, Z) \ - _mm512_or_si512( _mm512_and_si512( X, Y ), \ - _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) -*/ -/* Functionally identical to original but optimizable, - * subexpression X^Y from one step can be reused in the next step as Y^Z -#define MAJ8W(X, Y, Z) \ - _mm512_xor_si512( Y, _mm512_and_si512( _mm512_xor_si512( X, Y ), \ - _mm512_xor_si512( Y, Z ) ) ) -*/ + _mm512_ternarylogic_epi64( X, Y, Z, 0xca ) #define MAJ8W(X, Y, Z) \ - _mm512_xor_si512( Y, _mm512_and_si512( X_xor_Y = _mm512_xor_si512( X, Y ), \ - Y_xor_Z ) ) + _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 ) #define BSG8W_5_0(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) ) + mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) ) #define BSG8W_5_1(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) ) + mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) ) #define SSG8W_5_0(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) ) + mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) ) #define SSG8W_5_1(x) \ - _mm512_xor_si512( _mm512_xor_si512( \ - mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) ) - -static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 ) -{ - __m512i w0a, w1a, w0b, w1b; - w0a = mm512_ror_64( w0, 1 ); - w1a = mm512_ror_64( w1,19 ); - w0b = mm512_ror_64( w0, 8 ); - w1b = mm512_ror_64( w1,61 ); - w0a = _mm512_xor_si512( w0a, w0b ); - w1a = _mm512_xor_si512( w1a, w1b ); - w0b = _mm512_srli_epi64( w0, 7 ); - w1b = _mm512_srli_epi64( w1, 6 ); - w0a = _mm512_xor_si512( w0a, w0b ); - w1a = _mm512_xor_si512( w1a, w1b ); - return _mm512_add_epi64( w0a, w1a ); -} - - -#define SSG8W_512x2_0( w0, w1, i ) do \ -{ \ - __m512i X0a, X1a, X0b, X1b; \ - X0a = mm512_ror_64( W[i-15], 1 ); \ - X1a = mm512_ror_64( W[i-14], 1 ); \ - X0b = mm512_ror_64( W[i-15], 8 ); \ - X1b = mm512_ror_64( W[i-14], 8 ); \ - X0a = _mm512_xor_si512( X0a, X0b ); \ - X1a = _mm512_xor_si512( X1a, X1b ); \ - X0b = _mm512_srli_epi64( W[i-15], 7 ); \ - X1b = _mm512_srli_epi64( W[i-14], 7 ); \ - w0 = 
_mm512_xor_si512( X0a, X0b ); \ - w1 = _mm512_xor_si512( X1a, X1b ); \ -} while(0) - -#define SSG8W_512x2_1( w0, w1, i ) do \ -{ \ - __m512i X0a, X1a, X0b, X1b; \ - X0a = mm512_ror_64( W[i-2],19 ); \ - X1a = mm512_ror_64( W[i-1],19 ); \ - X0b = mm512_ror_64( W[i-2],61 ); \ - X1b = mm512_ror_64( W[i-1],61 ); \ - X0a = _mm512_xor_si512( X0a, X0b ); \ - X1a = _mm512_xor_si512( X1a, X1b ); \ - X0b = _mm512_srli_epi64( W[i-2], 6 ); \ - X1b = _mm512_srli_epi64( W[i-1], 6 ); \ - w0 = _mm512_xor_si512( X0a, X0b ); \ - w1 = _mm512_xor_si512( X1a, X1b ); \ -} while(0) + mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) ) #define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \ do { \ @@ -184,7 +120,6 @@ do { \ T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \ K, W[i] ) ); \ T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \ - Y_xor_Z = X_xor_Y; \ D = _mm512_add_epi64( D, T1 ); \ H = _mm512_add_epi64( T1, T2 ); \ } while (0) @@ -193,15 +128,15 @@ static void sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) { int i; - register __m512i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + register __m512i A, B, C, D, E, F, G, H; __m512i W[80]; mm512_block_bswap_64( W , in ); mm512_block_bswap_64( W+8, in+8 ); for ( i = 16; i < 80; i++ ) - W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ), - _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) ); + W[i] = mm512_add4_64( SSG8W_5_0( W[i-15] ), SSG8W_5_1( W[i-2] ), + W[ i- 7 ], W[ i-16 ] ); if ( ctx->initialized ) { @@ -226,8 +161,6 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) H = m512_const1_64( 0x5BE0CD19137E2179 ); } - Y_xor_Z = _mm512_xor_si512( B, C ); - for ( i = 0; i < 80; i += 8 ) { SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index b67b014..5e70c3e 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -73,7 +73,194 @@ static const sph_u32 H256[8] = { #if defined(__SHA__) -#include "sha256-hash-opt.c" +#include "simd-utils.h" + +static void sha2_round( const uint8_t input[], uint32_t state[8] ) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP, MASK; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state[0]); + STATE1 = _mm_load_si128((__m128i*) &state[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + MSG = _mm_load_si128((const __m128i*) (input+0)); + TMSG0 = _mm_shuffle_epi8(MSG, MASK); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); + TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); + TMSG2 
= _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); + TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = 
_mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state[0], STATE0); + _mm_store_si128((__m128i*) &state[4], STATE1); +} #else // no SHA diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index dffa18d..c53cb39 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -310,12 +310,13 @@ do { \ #define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ do { \ - xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \ + xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \ _mm256_andnot_si256( xb3, xb2 ), \ - _mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \ - _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \ - ) ), _mm256_set1_epi32(3UL) ) ) ) ); \ - xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \ + _mm256_mullo_epi32( mm256_xor3( xa0, xc, \ + _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \ + _mm256_set1_epi32(5UL) ) ), \ + _mm256_set1_epi32(3UL) ) ) ); \ + xb0 = mm256_xnor( xa0, mm256_rol_32( 
xb0, 1 ) ); \ } while (0) #define PERM_STEP_0_8 do { \ diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index d7cd470..711d8ac 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -309,22 +309,16 @@ static const uint64_t IV512[] = { sc->bcount = bcount; \ } while (0) -// AVX2 all scalar vars are now vectors representing 4 nonces in parallel - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ do { \ - k8 = _mm512_xor_si512( _mm512_xor_si512( \ - _mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \ - _mm512_xor_si512( k2, k3 ) ), \ - _mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \ - _mm512_xor_si512( k6, k7 ) ) ), \ - m512_const1_64( 0x1BD11BDAA9FC1A22) ); \ + k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \ + mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\ t2 = t0 ^ t1; \ } while (0) - + #define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \ do { \ w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \ @@ -340,7 +334,6 @@ do { \ m512_const1_64( s ) ) ); \ } while (0) - #define TFBIG_MIX_8WAY(x0, x1, rc) \ do { \ x0 = _mm512_add_epi64( x0, x1 ); \ diff --git a/configure b/configure index 403892f..8382a1b 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.5. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.0. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.16.5' -PACKAGE_STRING='cpuminer-opt 3.16.5' +PACKAGE_VERSION='3.17.0' +PACKAGE_STRING='cpuminer-opt 3.17.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.16.5 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.17.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.16.5:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.17.0:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.16.5 +cpuminer-opt configure 3.17.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.16.5, which was +It was created by cpuminer-opt $as_me 3.17.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.16.5' + VERSION='3.17.0' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. 
ac_log=" -This file was extended by cpuminer-opt $as_me 3.16.5, which was +This file was extended by cpuminer-opt $as_me 3.17.0, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.16.5 +cpuminer-opt config.status 3.17.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index c0aca33..f5612ef 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.16.5]) +AC_INIT([cpuminer-opt], [3.17.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index e46d920..9b72376 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2093,10 +2093,10 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) sctx->block_height, net_diff, g_work->job_id ); else if ( !opt_quiet ) { - unsigned char *xnonce2str = abin2hex( g_work->xnonce2, - g_work->xnonce2_len ); - applog( LOG_INFO, "Extranonce2 %s, Block %d, Net Diff %.5g", - xnonce2str, sctx->block_height, net_diff ); + unsigned char *xnonce2str = bebin2hex( g_work->xnonce2, + g_work->xnonce2_len ); + applog( LOG_INFO, "Extranonce2 %s, Block %d, Job %s", + xnonce2str, sctx->block_height, g_work->job_id ); free( xnonce2str ); } diff --git a/miner.h b/miner.h index bea4f68..9ca56b8 100644 --- a/miner.h +++ b/miner.h @@ -307,6 +307,7 @@ extern json_t *json_rpc_call( CURL *curl, const char *url, const char *userpass, extern void cbin2hex(char *out, const char *in, size_t len); void bin2hex( char *s, const unsigned char *p, size_t len ); char *abin2hex( const unsigned char *p, size_t len ); +char *bebin2hex( const unsigned char *p, size_t len ); bool hex2bin( unsigned char *p, const char *hexstr, size_t len ); bool jobj_binary( const json_t *obj, const char *key, void *buf, size_t buflen ); diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 7a37012..1b9fca8 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -136,9 +136,84 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_add4_8( a, b, c, d ) \ _mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) ) +#if defined(__AVX512VL__) + +// AVX512 has ternary logic that supports any 3 input boolean expression. 
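The imm8 constants below come straight from the operation's truth table: for each bit position the result is imm8[(a<<2)|(b<<1)|c], with the first operand supplying the high index bit. A small scalar model (illustrative only, not part of the patch) makes it easy to check a constant against the expression it is meant to encode, including the 0xca and 0xe8 values used for CH8W and MAJ8W elsewhere in this patch:

   #include <stdint.h>

   // Scalar model of vpternlog: for every bit, index imm8 with the bit of
   // 'a' as the high index bit, then 'b', then 'c'.
   static inline uint64_t ternlog64_model( uint64_t a, uint64_t b, uint64_t c,
                                           uint8_t imm8 )
   {
      uint64_t r = 0;
      for ( int i = 0; i < 64; i++ )
      {
         int idx = (int)( ( (a >> i) & 1 ) << 2
                        | ( (b >> i) & 1 ) << 1
                        | ( (c >> i) & 1 ) );
         r |= (uint64_t)( ( imm8 >> idx ) & 1 ) << i;
      }
      return r;
   }
   // ternlog64_model( a, b, c, 0x96 ) == ( a ^ b ^ c )
   // ternlog64_model( a, b, c, 0xca ) == ( (a & b) | (~a & c) )        // CH
   // ternlog64_model( a, b, c, 0xe8 ) == ( (a&b) | (a&c) | (b&c) )     // MAJ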
+ +// a ^ b ^ c +#define mm256_xor3( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x96 ) + +// legacy convenience only +#define mm256_xor4( a, b, c, d ) \ + _mm256_xor_si256( a, mm256_xor3( b, c, d ) ) + +// a & b & c +#define mm256_and3( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x80 ) + +// a | b | c +#define mm256_or3( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0xfe ) + +// a ^ ( b & c ) +#define mm256_xorand( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x78 ) + +// a & ( b ^ c ) +#define mm256_andxor( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x60 ) + +// a ^ ( b | c ) +#define mm256_xoror( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0x1e ) + +// a ^ ( ~b & c ) +#define mm256_xorandnot( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0xd2 ) + +// a | ( b & c ) +#define mm256_orand( a, b, c ) \ + _mm256_ternarylogic_epi64( a, b, c, 0xf8 ) + +// ~( a ^ b ), same as (~a) ^ b +#define mm256_xnor( a, b ) \ + _mm256_ternarylogic_epi64( a, b, b, 0x81 ) + +#else + +#define mm256_xor3( a, b, c ) \ + _mm256_xor_si256( a, _mm256_xor_si256( b, c ) ) + #define mm256_xor4( a, b, c, d ) \ _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) ) +#define mm256_and3( a, b, c ) \ + _mm256_and_si256( a, _mm256_and_si256( b, c ) ) + +#define mm256_or3( a, b, c ) \ + _mm256_or_si256( a, _mm256_or_si256( b, c ) ) + +#define mm256_xorand( a, b, c ) \ + _mm256_xor_si256( a, _mm256_and_si256( b, c ) ) + +#define mm256_andxor( a, b, c ) \ + _mm256_and_si256( a, _mm256_xor_si256( b, c )) + +#define mm256_xoror( a, b, c ) \ + _mm256_xor_si256( a, _mm256_or_si256( b, c ) ) + +#define mm256_xorandnot( a, b, c ) \ + _mm256_xor_si256( a, _mm256_andnot_si256( b, c ) ) + +#define mm256_orand( a, b, c ) \ + _mm256_or_si256( a, _mm256_and_si256( b, c ) ) + +#define mm256_xnor( a, b ) \ + mm256_not( _mm256_xor_si256( a, b ) ) + +#endif + // // Bit rotations. // diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 22c5331..e6b7ac2 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -61,7 +61,7 @@ // // Additionally, permutations using smaller vectors can be more efficient // if the permutation doesn't cross lane boundaries, typically 128 bits, -// and the smnaller vector can use an imm comtrol. +// and the smaller vector can use an imm comtrol. // // If the permutation doesn't cross lane boundaries a shuffle instructions // can be used with imm control instead of permute. @@ -107,7 +107,7 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, return v.m512i; } -// Equivalent of set1, broadcast lo element all elements. +// Equivalent of set1, broadcast lo element to all elements. 
static inline __m512i m512_const1_256( const __m256i v )
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
@@ -166,7 +166,9 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
// Basic operations without SIMD equivalent
// ~x
-#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
+// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
+static inline __m512i mm512_not( const __m512i x )
+{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
// -x
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
@@ -221,11 +223,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_add4_8( a, b, c, d ) \
 _mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) )
-#define mm512_xor4( a, b, c, d ) \
- _mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) )
-
-
 //
+// Ternary logic uses an 8 bit truth table to define any 3 input logical
+// operation using any number or combination of AND, OR, XOR, NOT.
+
+// a ^ b ^ c
+#define mm512_xor3( a, b, c ) \
+ _mm512_ternarylogic_epi64( a, b, c, 0x96 )
+
+// legacy convenience only
+#define mm512_xor4( a, b, c, d ) \
+ _mm512_xor_si512( a, mm512_xor3( b, c, d ) )
+
+// a & b & c
+#define mm512_and3( a, b, c ) \
+ _mm512_ternarylogic_epi64( a, b, c, 0x80 )
+
+// a | b | c
+#define mm512_or3( a, b, c ) \
+ _mm512_ternarylogic_epi64( a, b, c, 0xfe )
+
+// a ^ ( b & c )
+#define mm512_xorand( a, b, c ) \
+ _mm512_ternarylogic_epi64( a, b, c, 0x78 )
+
+// a & ( b ^ c )
+#define mm512_andxor( a, b, c ) \
+ _mm512_ternarylogic_epi64( a, b, c, 0x60 )
+
+// a ^ ( b | c )
+#define mm512_xoror( a, b, c ) \
+ _mm512_ternarylogic_epi64( a, b, c, 0x1e )
+
+// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ) ]
+#define mm512_xorandnot( a, b, c ) \
+ _mm512_ternarylogic_epi64( a, b, c, 0xd2 )
+
+// a | ( b & c )
+#define mm512_orand( a, b, c ) \
+ _mm512_ternarylogic_epi64( a, b, c, 0xf8 )
+
+// Some 2 input operations that don't have their own instruction mnemonic.
+
+// ~( a | b )
+#define mm512_nor( a, b ) \
+ _mm512_ternarylogic_epi64( a, b, b, 0x01 )
+
+// ~( a ^ b ), same as (~a) ^ b
+#define mm512_xnor( a, b ) \
+ _mm512_ternarylogic_epi64( a, b, b, 0x81 )
+
+// ~( a & b )
+#define mm512_nand( a, b ) \
+ _mm512_ternarylogic_epi64( a, b, b, 0x7f )
+
+
// Bit rotations.
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
diff --git a/util.c b/util.c index 2bfc809..b96c4fe 100644
--- a/util.c
+++ b/util.c
@@ -795,6 +795,15 @@ char *abin2hex(const unsigned char *p, size_t len)
 return s;
}
+char *bebin2hex(const unsigned char *p, size_t len)
+{
+ char *s = (char*) malloc((len * 2) + 1);
+ if (!s) return NULL;
+ for ( size_t i = 0, j = len - 1; i < len; i++, j-- )
+ sprintf( s + ( i*2 ), "%02x", (unsigned int) p[ j ] );
+ return s;
+}
+
bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
{
 char hex_byte[3];
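bebin2hex() is the byte reversed counterpart of abin2hex() and is what the Extranonce2 log line earlier in this patch now uses. A small usage sketch (illustrative only, not part of the patch):

   #include <stdio.h>
   #include <stdlib.h>
   #include "miner.h"   // declares abin2hex() and the new bebin2hex()

   int main(void)
   {
      const unsigned char xnonce2[4] = { 0x01, 0x02, 0x03, 0x04 };
      char *fwd = abin2hex( xnonce2, 4 );    // "01020304", memory order
      char *rev = bebin2hex( xnonce2, 4 );   // "04030201", reversed, as logged now
      printf( "%s %s\n", fwd, rev );
      free( fwd );
      free( rev );
      return 0;
   }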