diff --git a/Makefile.am b/Makefile.am index 36e208a..db71cf1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,6 +21,7 @@ cpuminer_SOURCES = \ api.c \ sysinfos.c \ algo-gate-api.c\ + malloc-huge.c \ algo/argon2/argon2a/argon2a.c \ algo/argon2/argon2a/ar2/argon2.c \ algo/argon2/argon2a/ar2/opt.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index a8a5e1a..9f3fb6a 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,12 @@ If not what makes it happen or not happen? Change Log ---------- +v3.19.3 + +Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available. + +Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512. + v3.19.2 Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1. diff --git a/algo/argon2/argon2d/argon2d/opt.c b/algo/argon2/argon2d/argon2d/opt.c index 3182930..5164a1e 100644 --- a/algo/argon2/argon2d/argon2d/opt.c +++ b/algo/argon2/argon2d/argon2d/opt.c @@ -37,6 +37,13 @@ #if defined(__AVX512F__) +static inline __m512i blamka( __m512i x, __m512i y ) +{ + __m512i xy = _mm512_mul_epu32( x, y ); + return _mm512_add_epi64( _mm512_add_epi64( x, y ), + _mm512_add_epi64( xy, xy ) ); +} + static void fill_block( __m512i *state, const block *ref_block, block *next_block, int with_xor ) { diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 809961c..4cb8bda 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -328,9 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { #include -#define ROR64(x, n) _mm512_ror_epi64((x), (n)) - -static __m512i muladd(__m512i x, __m512i y) +static inline __m512i muladd(__m512i x, __m512i y) { __m512i z = _mm512_mul_epu32(x, y); return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z)); @@ -344,8 +342,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ROR64(D0, 32); \ - D1 = ROR64(D1, 32); \ + D0 = _mm512_ror_epi64(D0, 32); \ + D1 = _mm512_ror_epi64(D1, 32); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -353,8 +351,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ROR64(B0, 24); \ - B1 = ROR64(B1, 24); \ + B0 = _mm512_ror_epi64(B0, 24); \ + B1 = _mm512_ror_epi64(B1, 24); \ } while ((void)0, 0) #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -365,8 +363,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ROR64(D0, 16); \ - D1 = ROR64(D1, 16); \ + D0 = _mm512_ror_epi64(D0, 16); \ + D1 = _mm512_ror_epi64(D1, 16); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -374,8 +372,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ROR64(B0, 63); \ - B1 = ROR64(B1, 63); \ + B0 = _mm512_ror_epi64(B0, 63); \ + B1 = _mm512_ror_epi64(B1, 63); \ } while ((void)0, 0) #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -417,11 +415,10 @@ static __m512i muladd(__m512i x, __m512i y) #define SWAP_HALVES(A0, A1) \ do { \ - __m512i t0, t1; \ - t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \ - t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \ - A0 = t0; \ - A1 = t1; \ + __m512i t; \ + t = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \ + A1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \ + A0 = t; \ } while((void)0, 0) #define 
SWAP_QUARTERS(A0, A1) \ diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index b7b7c70..38bf076 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -545,21 +545,23 @@ static const sph_u32 T512[64][16] = { #define sE c7 #define sF m7 - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) // Hamsi 8 way AVX512 -// Tested on i9-9940x movepi64_mask is slow, cmple_epi64_mask with zero -// produces the same result but is faster. +// Intel says _mm512_movepi64_mask has (1L/1T) timimg while +// _mm512_cmplt_epi64_mask as (3L/1T) timing, however, when tested hashing X13 +// on i9-9940x cmplt with zero was 3% faster than movepi. + #define INPUT_BIG8 \ do { \ __m512i db = _mm512_ror_epi64( *buf, 1 ); \ + const __m512i zero = m512_zero; \ const uint64_t *tp = (const uint64_t*)T512; \ - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \ + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \ for ( int u = 0; u < 64; u++ ) \ { \ - __mmask8 dm = _mm512_cmplt_epi64_mask( db, m512_zero ); \ + const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \ m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \ m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \ m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \ @@ -573,29 +575,6 @@ do { \ } \ } while (0) -/* -#define INPUT_BIG8 \ -do { \ - __m512i db = *buf; \ - const uint64_t *tp = (const uint64_t*)T512; \ - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \ - for ( int u = 0; u < 64; u++ ) \ - { \ - __m512i dm = mm512_negate_64( _mm512_and_si512( db, m512_one_64 ) ); \ - m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \ - m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \ - m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \ - m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \ - m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \ - m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \ - m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \ - m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \ - tp += 8; \ - db = _mm512_srli_epi64( db, 1 ); \ - } \ -} while (0) -*/ - #define SBOX8( a, b, c, d ) \ do { \ __m512i t; \ @@ -632,199 +611,192 @@ do { \ #define READ_STATE_BIG8(sc) \ do { \ - c0 = sc->h[0x0]; \ - c1 = sc->h[0x1]; \ - c2 = sc->h[0x2]; \ - c3 = sc->h[0x3]; \ - c4 = sc->h[0x4]; \ - c5 = sc->h[0x5]; \ - c6 = sc->h[0x6]; \ - c7 = sc->h[0x7]; \ + c0 = sc->h[0]; \ + c1 = sc->h[1]; \ + c2 = sc->h[2]; \ + c3 = sc->h[3]; \ + c4 = sc->h[4]; \ + c5 = sc->h[5]; \ + c6 = sc->h[6]; \ + c7 = sc->h[7]; \ } while (0) #define WRITE_STATE_BIG8(sc) \ do { \ - sc->h[0x0] = c0; \ - sc->h[0x1] = c1; \ - sc->h[0x2] = c2; \ - sc->h[0x3] = c3; \ - sc->h[0x4] = c4; \ - sc->h[0x5] = c5; \ - sc->h[0x6] = c6; \ - sc->h[0x7] = c7; \ + sc->h[0] = c0; \ + sc->h[1] = c1; \ + sc->h[2] = c2; \ + sc->h[3] = c3; \ + sc->h[4] = c4; \ + sc->h[5] = c5; \ + sc->h[6] = c6; \ + sc->h[7] = c7; \ } while (0) - #define ROUND_BIG8( alpha ) \ do { \ __m512i t0, t1, t2, t3; \ - s0 = _mm512_xor_si512( s0, alpha[ 0] ); \ - s1 = _mm512_xor_si512( s1, alpha[ 1] ); \ - s2 = _mm512_xor_si512( s2, alpha[ 2] ); \ - s3 = _mm512_xor_si512( s3, alpha[ 3] ); \ - s4 = _mm512_xor_si512( s4, alpha[ 4] ); \ - s5 = _mm512_xor_si512( s5, alpha[ 5] ); \ - s6 = _mm512_xor_si512( s6, alpha[ 6] ); \ - s7 = _mm512_xor_si512( s7, alpha[ 7] ); \ - s8 = _mm512_xor_si512( s8, alpha[ 8] ); \ - s9 = _mm512_xor_si512( s9, alpha[ 9] ); \ - sA = 
_mm512_xor_si512( sA, alpha[10] ); \ - sB = _mm512_xor_si512( sB, alpha[11] ); \ - sC = _mm512_xor_si512( sC, alpha[12] ); \ - sD = _mm512_xor_si512( sD, alpha[13] ); \ - sE = _mm512_xor_si512( sE, alpha[14] ); \ - sF = _mm512_xor_si512( sF, alpha[15] ); \ + s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \ + s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \ + s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \ + s3 = _mm512_xor_si512( s3, alpha[ 3] ); /* c1 */ \ + s4 = _mm512_xor_si512( s4, alpha[ 4] ); /* c2 */ \ + s5 = _mm512_xor_si512( s5, alpha[ 5] ); /* m2 */ \ + s6 = _mm512_xor_si512( s6, alpha[ 6] ); /* c3 */ \ + s7 = _mm512_xor_si512( s7, alpha[ 7] ); /* m3 */ \ + s8 = _mm512_xor_si512( s8, alpha[ 8] ); /* m4 */ \ + s9 = _mm512_xor_si512( s9, alpha[ 9] ); /* c4 */ \ + sA = _mm512_xor_si512( sA, alpha[10] ); /* m5 */ \ + sB = _mm512_xor_si512( sB, alpha[11] ); /* c5 */ \ + sC = _mm512_xor_si512( sC, alpha[12] ); /* c6 */ \ + sD = _mm512_xor_si512( sD, alpha[13] ); /* m6 */ \ + sE = _mm512_xor_si512( sE, alpha[14] ); /* c7 */ \ + sF = _mm512_xor_si512( sF, alpha[15] ); /* m7 */ \ \ - SBOX8( s0, s4, s8, sC ); \ - SBOX8( s1, s5, s9, sD ); \ - SBOX8( s2, s6, sA, sE ); \ - SBOX8( s3, s7, sB, sF ); \ + SBOX8( s0, s4, s8, sC ); /* ( m0, c2, m4, c6 ) */ \ + SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \ + SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \ + SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \ - _mm512_bslli_epi128( s5, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \ - _mm512_bslli_epi128( sE, 4 ) ); \ + s4 = mm512_swap64_32( s4 ); \ + s5 = mm512_swap64_32( s5 ); \ + sD = mm512_swap64_32( sD ); \ + sE = mm512_swap64_32( sE ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \ L8( s0, t1, s9, t3 ); \ - s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \ - s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \ - sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \ - sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \ + s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \ + s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \ + sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \ + sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ - _mm512_bslli_epi128( s6, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \ - _mm512_bslli_epi128( sF, 4 ) ); \ + s6 = mm512_swap64_32( s6 ); \ + sF = mm512_swap64_32( sF ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \ L8( s1, t1, sA, t3 ); \ - s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ - s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \ - sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \ - sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \ + s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \ + s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \ + sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \ + sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \ - _mm512_bslli_epi128( s7, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \ - 
_mm512_bslli_epi128( sC, 4 ) ); \ + s7 = mm512_swap64_32( s7 ); \ + sC = mm512_swap64_32( sC ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \ L8( s2, t1, sB, t3 ); \ - s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \ - s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \ - sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \ - sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \ + s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \ + s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \ + sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \ + sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \ + s6 = mm512_swap64_32( s6 ); \ + sF = mm512_swap64_32( sF ); \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \ - _mm512_bslli_epi128( s4, 4 ) ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \ - _mm512_bslli_epi128( sD, 4 ) ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \ L8( s3, t1, s8, t3 ); \ - s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \ - s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \ - sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \ - sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \ + s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \ + s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \ + sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \ + sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \ + s7 = mm512_swap64_32( s7 ); \ + sC = mm512_swap64_32( sC ); \ \ - t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \ + t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \ - t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \ - _mm512_bslli_epi128( sB, 4 ) ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \ + t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \ + t3 = mm512_swap64_32( t3 ); \ L8( t0, t1, t2, t3 ); \ + t3 = mm512_swap64_32( t3 ); \ s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \ - s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \ + s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \ s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \ s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \ - s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \ + s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \ sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \ - s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \ - sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \ + s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \ + sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \ \ - t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ - _mm512_bslli_epi128( sD, 4 ) ); \ - t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \ + t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \ t3 = _mm512_mask_blend_epi32( 
0xaaaa, s7, sF ); \ L8( t0, t1, t2, t3 ); \ - s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \ + s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \ sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \ - s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ - sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \ + s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \ + sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \ s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \ - sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \ + sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \ s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \ sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \ + s4 = mm512_swap64_32( s4 ); \ + s5 = mm512_swap64_32( s5 ); \ + sD = mm512_swap64_32( sD ); \ + sE = mm512_swap64_32( sE ); \ } while (0) #define P_BIG8 \ do { \ __m512i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \ for( int i = 0; i < 16; i++ ) \ alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ } while (0) #define PF_BIG8 \ do { \ __m512i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \ for( int i = 0; i < 16; i++ ) \ alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + 
alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ - alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \ ROUND_BIG8( alpha ); \ } while (0) #define T_BIG8 \ do { /* order is important */ \ - c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \ - c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \ - c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \ - c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \ - c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \ - c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \ - c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \ - c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \ + c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB ); /* c5 */ \ + c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA ); /* m5 */ \ + c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 ); /* c4 */ \ + c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 ); /* m4 */ \ + c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 ); /* c1 */ \ + c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 ); /* m1 */ \ + c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 ); /* c0 */ \ + c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \ } while (0) void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) @@ -861,7 +833,6 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf ) WRITE_STATE_BIG8( sc ); } - void hamsi512_8way_init( hamsi_8way_big_context *sc ) { sc->partial_len = 0; @@ -911,11 +882,12 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) #define INPUT_BIG \ do { \ __m256i db = *buf; \ + const __m256i zero = m256_zero; \ const uint64_t *tp = (const uint64_t*)T512; \ - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \ - for ( int u = 0; u < 64; u++ ) \ + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \ + for ( int u = 63; u >= 0; u-- ) \ { \ - __m256i dm = mm256_negate_64( _mm256_and_si256( db, m256_one_64 ) ); \ + __m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \ m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ m256_const1_64( tp[0] ) ) ); \ m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ @@ -933,7 +905,6 @@ do { \ m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \ m256_const1_64( tp[7] ) ) ); \ tp += 8; \ - db = _mm256_srli_epi64( db, 1 ); \ } \ } while (0) @@ -982,47 +953,28 @@ do { \ #define READ_STATE_BIG(sc) \ do { \ - c0 = sc->h[0x0]; \ - c1 = sc->h[0x1]; \ - c2 = sc->h[0x2]; \ - c3 = sc->h[0x3]; \ - c4 = sc->h[0x4]; \ - c5 = sc->h[0x5]; \ - c6 = sc->h[0x6]; \ - c7 = sc->h[0x7]; \ + c0 = sc->h[0]; \ + c1 = sc->h[1]; \ + c2 = sc->h[2]; \ + c3 = sc->h[3]; \ + c4 = sc->h[4]; \ + c5 = sc->h[5]; \ + c6 = sc->h[6]; \ + c7 = sc->h[7]; \ } while (0) #define WRITE_STATE_BIG(sc) \ do { \ - sc->h[0x0] = c0; \ - sc->h[0x1] = c1; \ - sc->h[0x2] = c2; \ - sc->h[0x3] = c3; \ - sc->h[0x4] = c4; \ - sc->h[0x5] = c5; \ - sc->h[0x6] = c6; \ - sc->h[0x7] = c7; \ + sc->h[0] = c0; \ + sc->h[1] = c1; \ + sc->h[2] = c2; \ + sc->h[3] = c3; \ + sc->h[4] = c4; \ + sc->h[5] = c5; \ + sc->h[6] = c6; \ + sc->h[7] = c7; \ } while (0) -/* -#define s0 m0 -#define s1 c0 -#define s2 m1 -#define s3 c1 -#define s4 c2 -#define s5 m2 -#define s6 c3 -#define s7 m3 -#define s8 m4 -#define s9 c4 
-#define sA m5 -#define sB c5 -#define sC c6 -#define sD m6 -#define sE c7 -#define sF m7 -*/ - #define ROUND_BIG( alpha ) \ do { \ __m256i t0, t1, t2, t3; \ @@ -1048,151 +1000,145 @@ do { \ SBOX( s2, s6, sA, sE ); \ SBOX( s3, s7, sB, sF ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \ - _mm256_bslli_epi128( s5, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \ - _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ + s4 = mm256_swap64_32( s4 ); \ + s5 = mm256_swap64_32( s5 ); \ + sD = mm256_swap64_32( sD ); \ + sE = mm256_swap64_32( sE ); \ + t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \ + t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \ L( s0, t1, s9, t3 ); \ - s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \ + s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \ + sD = _mm256_blend_epi32( sD, t3, 0x55 ); \ + sE = _mm256_blend_epi32( sE, t3, 0xaa ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ - _mm256_bslli_epi128( s6, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \ - _mm256_bslli_epi128( sF, 4 ), 0xAA ); \ + s6 = mm256_swap64_32( s6 ); \ + sF = mm256_swap64_32( sF ); \ + t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \ + t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \ L( s1, t1, sA, t3 ); \ - s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \ + s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \ + sE = _mm256_blend_epi32( sE, t3, 0x55 ); \ + sF = _mm256_blend_epi32( sF, t3, 0xaa ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \ - _mm256_bslli_epi128( s7, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \ - _mm256_bslli_epi128( sC, 4 ), 0xAA ); \ + s7 = mm256_swap64_32( s7 ); \ + sC = mm256_swap64_32( sC ); \ + t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \ + t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \ L( s2, t1, sB, t3 ); \ - s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \ + s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \ + sF = _mm256_blend_epi32( sF, t3, 0x55 ); \ + sC = _mm256_blend_epi32( sC, t3, 0xaa ); \ + s6 = mm256_swap64_32( s6 ); \ + sF = mm256_swap64_32( sF ); \ \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \ - _mm256_bslli_epi128( s4, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \ - _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ + t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \ + t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \ L( s3, t1, s8, t3 ); \ - s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ - s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ - sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ - sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ + s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \ + s4 = 
_mm256_blend_epi32( s4, t1, 0xaa ); \ + sC = _mm256_blend_epi32( sC, t3, 0x55 ); \ + sD = _mm256_blend_epi32( sD, t3, 0xaa ); \ + s7 = mm256_swap64_32( s7 ); \ + sC = mm256_swap64_32( sC ); \ \ - t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \ - t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \ - t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \ - t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \ - _mm256_bslli_epi128( sB, 4 ), 0xAA ); \ + t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \ + t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \ + t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \ + t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \ + t3 = mm256_swap64_32( t3 ); \ L( t0, t1, t2, t3 ); \ + t3 = mm256_swap64_32( t3 ); \ s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \ - s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \ + s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \ s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \ - s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \ - s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \ - sA = _mm256_blend_epi32( sA, t2, 0xAA ); \ - s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \ - sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \ + s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \ + s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \ + sA = _mm256_blend_epi32( sA, t2, 0xaa ); \ + s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \ + sB = _mm256_blend_epi32( sB, t3, 0x55 ); \ \ - t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \ - t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ - _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ - t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ - t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \ + t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \ + t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \ + t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \ + t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \ L( t0, t1, t2, t3 ); \ - s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \ - sC = _mm256_blend_epi32( sC, t0, 0xAA ); \ - s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \ - sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \ + s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \ + sC = _mm256_blend_epi32( sC, t0, 0xaa ); \ + s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \ + sD = _mm256_blend_epi32( sD, t1, 0xaa ); \ s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \ - sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \ + sE = _mm256_blend_epi32( sE, t2, 0xaa ); \ s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \ - sF = _mm256_blend_epi32( sF, t3, 0xAA ); \ + sF = _mm256_blend_epi32( sF, t3, 0xaa ); \ + s4 = mm256_swap64_32( s4 ); \ + s5 = mm256_swap64_32( s5 ); \ + sD = mm256_swap64_32( sD ); \ + sE = mm256_swap64_32( sE ); \ } while (0) #define P_BIG \ do { \ __m256i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \ for( int i = 0; i < 16; i++ ) \ alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = 
m256_const1_64( (3ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ - ^ ( (uint64_t*)alpha_n )[0] ); \ + alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ } while (0) #define PF_BIG \ do { \ __m256i alpha[16]; \ + const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \ for( int i = 0; i < 16; i++ ) \ alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ - alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \ - ^ ( (uint64_t*)alpha_f )[0] ); \ + alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \ ROUND_BIG( alpha ); \ } while (0) #define T_BIG \ do { /* order is important */ \ - c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \ - c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \ - c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \ - c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \ - c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \ - c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \ - c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \ - c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \ + c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \ + c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \ + c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \ + c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \ + c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \ + c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \ + c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \ + c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \ } while (0) void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) diff --git a/algo/keccak/keccak-hash-4way.c 
b/algo/keccak/keccak-hash-4way.c index af37d6f..458201c 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -53,7 +53,8 @@ static const uint64_t RC[] = { #define WRITE_STATE(sc) #define MOV64(d, s) (d = s) -#define XOR64_IOTA XOR64 +#define XOR64_IOTA XOR + #define LPAR ( #define RPAR ) @@ -71,14 +72,15 @@ static const uint64_t RC[] = { // Targetted macros, keccak-macros.h is included for each target. #define DECL64(x) __m512i x -#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) +#define XOR(d, a, b) (d = _mm512_xor_si512(a,b)) +#define XOR64 XOR #define AND64(d, a, b) (d = _mm512_and_si512(a,b)) #define OR64(d, a, b) (d = _mm512_or_si512(a,b)) #define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) #define ROL64(d, v, n) (d = mm512_rol_64(v, n)) #define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) #define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c)) - +#define XOR3( d, a, b, c ) (d = mm512_xor3( a, b, c )) #include "keccak-macros.c" @@ -236,6 +238,7 @@ keccak512_8way_close(void *cc, void *dst) #undef INPUT_BUF #undef DECL64 #undef XOR64 +#undef XOR #undef AND64 #undef OR64 #undef NOT64 @@ -243,7 +246,7 @@ keccak512_8way_close(void *cc, void *dst) #undef KECCAK_F_1600 #undef XOROR #undef XORAND - +#undef XOR3 #endif // AVX512 // AVX2 @@ -255,13 +258,15 @@ keccak512_8way_close(void *cc, void *dst) } while (0) #define DECL64(x) __m256i x -#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) +#define XOR(d, a, b) (d = _mm256_xor_si256(a,b)) +#define XOR64 XOR #define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define ROL64(d, v, n) (d = mm256_rol_64(v, n)) #define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c))) #define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c))) +#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c )) #include "keccak-macros.c" @@ -421,6 +426,7 @@ keccak512_4way_close(void *cc, void *dst) #undef INPUT_BUF #undef DECL64 #undef XOR64 +#undef XOR #undef AND64 #undef OR64 #undef NOT64 @@ -428,5 +434,6 @@ keccak512_4way_close(void *cc, void *dst) #undef KECCAK_F_1600 #undef XOROR #undef XORAND +#undef XOR3 #endif // AVX2 diff --git a/algo/keccak/keccak-macros.c b/algo/keccak/keccak-macros.c index 436d1ca..6b7776d 100644 --- a/algo/keccak/keccak-macros.c +++ b/algo/keccak/keccak-macros.c @@ -1,6 +1,19 @@ #ifdef TH_ELT #undef TH_ELT #endif + +#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ + DECL64(tt0); \ + DECL64(tt1); \ + XOR3( tt0, d0, d1, d4 ); \ + XOR( tt1, d2, d3 ); \ + XOR( tt0, tt0, tt1 ); \ + ROL64( tt0, tt0, 1 ); \ + XOR3( tt1, c0, c1, c4 ); \ + XOR3( tt0, tt0, c2, c3 ); \ + XOR( t, tt0, tt1 ); \ +} while (0) +/* #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ DECL64(tt0); \ DECL64(tt1); \ @@ -17,7 +30,7 @@ XOR64(tt2, tt2, tt3); \ XOR64(t, tt0, tt2); \ } while (0) - +*/ #ifdef THETA #undef THETA #endif diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index 5557ca3..c36411b 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -34,6 +34,7 @@ #include "algo/sha/sha-hash-4way.h" #include "algo/sha/sha256-hash.h" #include +#include "malloc-huge.h" static const uint32_t keypad[12] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 @@ -1487,11 +1488,19 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, bool scrypt_miner_thread_init( int thr_id ) { - scratchbuf = _mm_malloc( scratchbuf_size, 128 ); + scratchbuf = 
malloc_hugepages( scratchbuf_size ); if ( scratchbuf ) - return true; + { + if ( opt_debug ) + applog( LOG_NOTICE, "Thread %u is using huge pages", thr_id ); + } + else + scratchbuf = _mm_malloc( scratchbuf_size, 128 ); + + if ( scratchbuf ) return true; + applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); - return false; + return false; } bool register_scrypt_algo( algo_gate_t* gate ) diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index 8225595..06116ff 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -62,8 +62,8 @@ extern "C"{ #if defined(__AVX2__) #define DECL_STATE8 \ - __m256i A00, A01, A02, A03, A04, A05, A06, A07, \ - A08, A09, A0A, A0B; \ + __m256i A0, A1, A2, A3, A4, A5, A6, A7, \ + A8, A9, AA, AB; \ __m256i B0, B1, B2, B3, B4, B5, B6, B7, \ B8, B9, BA, BB, BC, BD, BE, BF; \ __m256i C0, C1, C2, C3, C4, C5, C6, C7, \ @@ -78,18 +78,18 @@ extern "C"{ { \ if ( (state)->state_loaded ) \ { \ - A00 = (state)->A[0]; \ - A01 = (state)->A[1]; \ - A02 = (state)->A[2]; \ - A03 = (state)->A[3]; \ - A04 = (state)->A[4]; \ - A05 = (state)->A[5]; \ - A06 = (state)->A[6]; \ - A07 = (state)->A[7]; \ - A08 = (state)->A[8]; \ - A09 = (state)->A[9]; \ - A0A = (state)->A[10]; \ - A0B = (state)->A[11]; \ + A0 = (state)->A[0]; \ + A1 = (state)->A[1]; \ + A2 = (state)->A[2]; \ + A3 = (state)->A[3]; \ + A4 = (state)->A[4]; \ + A5 = (state)->A[5]; \ + A6 = (state)->A[6]; \ + A7 = (state)->A[7]; \ + A8 = (state)->A[8]; \ + A9 = (state)->A[9]; \ + AA = (state)->A[10]; \ + AB = (state)->A[11]; \ B0 = (state)->B[0]; \ B1 = (state)->B[1]; \ B2 = (state)->B[2]; \ @@ -126,18 +126,18 @@ extern "C"{ else \ { \ (state)->state_loaded = true; \ - A00 = m256_const1_64( 0x20728DFD20728DFD ); \ - A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \ - A02 = m256_const1_64( 0xE782B699E782B699 ); \ - A03 = m256_const1_64( 0x5530463255304632 ); \ - A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \ - A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \ - A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \ - A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \ - A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \ - A09 = m256_const1_64( 0x8BD144108BD14410 ); \ - A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \ - A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \ + A0 = m256_const1_64( 0x20728DFD20728DFD ); \ + A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \ + A2 = m256_const1_64( 0xE782B699E782B699 ); \ + A3 = m256_const1_64( 0x5530463255304632 ); \ + A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \ + A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \ + A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \ + A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \ + A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \ + A9 = m256_const1_64( 0x8BD144108BD14410 ); \ + AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \ + AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \ B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \ B1 = m256_const1_64( 0x07B385F307B385F3 ); \ B2 = m256_const1_64( 0xE7442C26E7442C26 ); \ @@ -176,18 +176,18 @@ extern "C"{ } while (0) #define WRITE_STATE8(state) do { \ - (state)->A[0] = A00; \ - (state)->A[1] = A01; \ - (state)->A[2] = A02; \ - (state)->A[3] = A03; \ - (state)->A[4] = A04; \ - (state)->A[5] = A05; \ - (state)->A[6] = A06; \ - (state)->A[7] = A07; \ - (state)->A[8] = A08; \ - (state)->A[9] = A09; \ - (state)->A[10] = A0A; \ - (state)->A[11] = A0B; \ + (state)->A[0] = A0; \ + (state)->A[1] = A1; \ + (state)->A[2] = A2; \ + (state)->A[3] = A3; \ + (state)->A[4] = A4; \ + (state)->A[5] = A5; \ + (state)->A[6] = A6; 
\ + (state)->A[7] = A7; \ + (state)->A[8] = A8; \ + (state)->A[9] = A9; \ + (state)->A[10] = AA; \ + (state)->A[11] = AB; \ (state)->B[0] = B0; \ (state)->B[1] = B1; \ (state)->B[2] = B2; \ @@ -286,8 +286,8 @@ do { \ #define XOR_W8 \ do { \ - A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \ - A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \ + A0 = _mm256_xor_si256( A0, _mm256_set1_epi32( Wlow ) ); \ + A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \ } while (0) #define SWAP_BC8 \ @@ -321,60 +321,60 @@ do { \ } while (0) #define PERM_STEP_0_8 do { \ - PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \ + PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_1_8 do { \ - PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \ + PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, 
MC); \ + PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_2_8 do { \ - PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \ } while (0) #define APPLY_P8 \ @@ -398,42 +398,42 @@ do { \ PERM_STEP_0_8; \ PERM_STEP_1_8; \ PERM_STEP_2_8; \ - A0B = _mm256_add_epi32( A0B, C6 ); \ - A0A = _mm256_add_epi32( A0A, C5 ); \ - A09 = _mm256_add_epi32( A09, C4 ); \ - A08 = _mm256_add_epi32( A08, C3 ); \ - A07 = _mm256_add_epi32( A07, C2 ); \ - A06 = _mm256_add_epi32( A06, C1 ); \ - A05 = _mm256_add_epi32( A05, C0 ); \ - A04 = _mm256_add_epi32( A04, CF ); \ - A03 = _mm256_add_epi32( A03, CE ); \ - A02 = _mm256_add_epi32( A02, CD ); \ - A01 = _mm256_add_epi32( A01, CC ); \ - A00 = _mm256_add_epi32( A00, CB ); \ - A0B = _mm256_add_epi32( A0B, CA ); \ - A0A = _mm256_add_epi32( A0A, C9 ); \ - A09 = _mm256_add_epi32( A09, C8 ); \ - A08 = _mm256_add_epi32( A08, C7 ); \ - A07 = _mm256_add_epi32( A07, C6 ); \ - A06 = _mm256_add_epi32( A06, C5 ); \ - A05 = _mm256_add_epi32( A05, C4 ); \ - A04 = _mm256_add_epi32( A04, C3 ); \ - A03 = _mm256_add_epi32( A03, C2 ); \ - A02 = _mm256_add_epi32( A02, C1 ); \ - A01 = _mm256_add_epi32( A01, C0 ); \ - A00 = _mm256_add_epi32( A00, CF ); \ - A0B = _mm256_add_epi32( A0B, CE ); \ - A0A = _mm256_add_epi32( A0A, CD ); \ - A09 = _mm256_add_epi32( A09, CC ); \ - A08 = _mm256_add_epi32( A08, CB ); \ - A07 = _mm256_add_epi32( A07, CA ); \ - A06 = _mm256_add_epi32( A06, C9 ); \ - A05 = _mm256_add_epi32( A05, C8 ); \ - A04 = _mm256_add_epi32( A04, C7 ); \ - A03 = _mm256_add_epi32( A03, C6 ); \ - A02 = _mm256_add_epi32( A02, C5 ); \ - A01 = _mm256_add_epi32( A01, C4 ); \ - A00 = _mm256_add_epi32( A00, C3 ); \ + AB = _mm256_add_epi32( AB, C6 ); \ + AA = _mm256_add_epi32( AA, C5 ); \ + A9 = _mm256_add_epi32( A9, C4 ); \ + A8 = _mm256_add_epi32( A8, C3 ); \ + A7 = _mm256_add_epi32( A7, C2 ); \ + A6 = _mm256_add_epi32( A6, C1 ); \ + A5 = _mm256_add_epi32( A5, C0 ); \ + A4 = _mm256_add_epi32( A4, CF ); \ + A3 = 
_mm256_add_epi32( A3, CE ); \ + A2 = _mm256_add_epi32( A2, CD ); \ + A1 = _mm256_add_epi32( A1, CC ); \ + A0 = _mm256_add_epi32( A0, CB ); \ + AB = _mm256_add_epi32( AB, CA ); \ + AA = _mm256_add_epi32( AA, C9 ); \ + A9 = _mm256_add_epi32( A9, C8 ); \ + A8 = _mm256_add_epi32( A8, C7 ); \ + A7 = _mm256_add_epi32( A7, C6 ); \ + A6 = _mm256_add_epi32( A6, C5 ); \ + A5 = _mm256_add_epi32( A5, C4 ); \ + A4 = _mm256_add_epi32( A4, C3 ); \ + A3 = _mm256_add_epi32( A3, C2 ); \ + A2 = _mm256_add_epi32( A2, C1 ); \ + A1 = _mm256_add_epi32( A1, C0 ); \ + A0 = _mm256_add_epi32( A0, CF ); \ + AB = _mm256_add_epi32( AB, CE ); \ + AA = _mm256_add_epi32( AA, CD ); \ + A9 = _mm256_add_epi32( A9, CC ); \ + A8 = _mm256_add_epi32( A8, CB ); \ + A7 = _mm256_add_epi32( A7, CA ); \ + A6 = _mm256_add_epi32( A6, C9 ); \ + A5 = _mm256_add_epi32( A5, C8 ); \ + A4 = _mm256_add_epi32( A4, C7 ); \ + A3 = _mm256_add_epi32( A3, C6 ); \ + A2 = _mm256_add_epi32( A2, C5 ); \ + A1 = _mm256_add_epi32( A1, C4 ); \ + A0 = _mm256_add_epi32( A0, C3 ); \ } while (0) #define INCR_W8 do { \ @@ -660,8 +660,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #define DECL_STATE \ - __m128i A00, A01, A02, A03, A04, A05, A06, A07, \ - A08, A09, A0A, A0B; \ + __m128i A0, A1, A2, A3, A4, A5, A6, A7, \ + A8, A9, AA, AB; \ __m128i B0, B1, B2, B3, B4, B5, B6, B7, \ B8, B9, BA, BB, BC, BD, BE, BF; \ __m128i C0, C1, C2, C3, C4, C5, C6, C7, \ @@ -676,18 +676,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { \ if ( (state)->state_loaded ) \ { \ - A00 = (state)->A[0]; \ - A01 = (state)->A[1]; \ - A02 = (state)->A[2]; \ - A03 = (state)->A[3]; \ - A04 = (state)->A[4]; \ - A05 = (state)->A[5]; \ - A06 = (state)->A[6]; \ - A07 = (state)->A[7]; \ - A08 = (state)->A[8]; \ - A09 = (state)->A[9]; \ - A0A = (state)->A[10]; \ - A0B = (state)->A[11]; \ + A0 = (state)->A[0]; \ + A1 = (state)->A[1]; \ + A2 = (state)->A[2]; \ + A3 = (state)->A[3]; \ + A4 = (state)->A[4]; \ + A5 = (state)->A[5]; \ + A6 = (state)->A[6]; \ + A7 = (state)->A[7]; \ + A8 = (state)->A[8]; \ + A9 = (state)->A[9]; \ + AA = (state)->A[10]; \ + AB = (state)->A[11]; \ B0 = (state)->B[0]; \ B1 = (state)->B[1]; \ B2 = (state)->B[2]; \ @@ -724,18 +724,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) else \ { \ (state)->state_loaded = true; \ - A00 = m128_const1_64( 0x20728DFD20728DFD ); \ - A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \ - A02 = m128_const1_64( 0xE782B699E782B699 ); \ - A03 = m128_const1_64( 0x5530463255304632 ); \ - A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \ - A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \ - A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \ - A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \ - A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \ - A09 = m128_const1_64( 0x8BD144108BD14410 ); \ - A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \ - A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \ + A0 = m128_const1_64( 0x20728DFD20728DFD ); \ + A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \ + A2 = m128_const1_64( 0xE782B699E782B699 ); \ + A3 = m128_const1_64( 0x5530463255304632 ); \ + A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \ + A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \ + A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \ + A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \ + A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \ + A9 = m128_const1_64( 0x8BD144108BD14410 ); \ + AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \ + AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \ B0 = m128_const1_64( 
0xC1099CB7C1099CB7 ); \ B1 = m128_const1_64( 0x07B385F307B385F3 ); \ B2 = m128_const1_64( 0xE7442C26E7442C26 ); \ @@ -774,18 +774,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) } while (0) #define WRITE_STATE(state) do { \ - (state)->A[0] = A00; \ - (state)->A[1] = A01; \ - (state)->A[2] = A02; \ - (state)->A[3] = A03; \ - (state)->A[4] = A04; \ - (state)->A[5] = A05; \ - (state)->A[6] = A06; \ - (state)->A[7] = A07; \ - (state)->A[8] = A08; \ - (state)->A[9] = A09; \ - (state)->A[10] = A0A; \ - (state)->A[11] = A0B; \ + (state)->A[0] = A0; \ + (state)->A[1] = A1; \ + (state)->A[2] = A2; \ + (state)->A[3] = A3; \ + (state)->A[4] = A4; \ + (state)->A[5] = A5; \ + (state)->A[6] = A6; \ + (state)->A[7] = A7; \ + (state)->A[8] = A8; \ + (state)->A[9] = A9; \ + (state)->A[10] = AA; \ + (state)->A[11] = AB; \ (state)->B[0] = B0; \ (state)->B[1] = B1; \ (state)->B[2] = B2; \ @@ -884,8 +884,8 @@ do { \ #define XOR_W \ do { \ - A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \ - A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \ + A0 = _mm_xor_si128( A0, _mm_set1_epi32( Wlow ) ); \ + A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \ } while (0) @@ -940,60 +940,60 @@ do { \ } while (0) #define PERM_STEP_0 do { \ - PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A2, A1, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A3, A2, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A4, A3, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A5, A4, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A6, A5, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A7, A6, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A8, A7, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A9, A8, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(AA, A9, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(AB, AA, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A0, AB, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A1, A0, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A2, A1, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A3, A2, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_1 do { \ - PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A07, A06, 
BF, BC, B8, B5, C9, MF); \ + PERM_ELT(A4, A3, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A5, A4, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A6, A5, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A7, A6, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A8, A7, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A9, A8, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(AA, A9, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(AB, AA, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A0, AB, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A1, A0, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A2, A1, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A3, A2, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A4, A3, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A5, A4, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A6, A5, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A7, A6, BF, BC, B8, B5, C9, MF); \ } while (0) #define PERM_STEP_2 do { \ - PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + PERM_ELT(A8, A7, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A9, A8, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(AA, A9, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(AB, AA, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A0, AB, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A1, A0, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A2, A1, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A3, A2, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A4, A3, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A5, A4, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A6, A5, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A7, A6, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A8, A7, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A9, A8, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(AA, A9, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(AB, AA, BF, BC, B8, B5, C9, MF); \ } while (0) #define APPLY_P \ @@ -1017,42 +1017,42 @@ do { \ PERM_STEP_0; \ PERM_STEP_1; \ PERM_STEP_2; \ - A0B = _mm_add_epi32( A0B, C6 ); \ - A0A = _mm_add_epi32( A0A, C5 ); \ - A09 = _mm_add_epi32( A09, C4 ); \ - A08 = _mm_add_epi32( A08, C3 ); \ - A07 = _mm_add_epi32( A07, C2 ); \ - A06 = _mm_add_epi32( A06, C1 ); \ - A05 = _mm_add_epi32( A05, C0 ); \ - A04 = _mm_add_epi32( A04, CF ); \ - A03 = _mm_add_epi32( A03, CE ); \ - A02 = _mm_add_epi32( A02, CD ); \ - A01 = _mm_add_epi32( A01, CC ); \ - A00 = _mm_add_epi32( A00, CB ); \ - A0B = _mm_add_epi32( A0B, CA ); \ - A0A = _mm_add_epi32( A0A, C9 ); \ - A09 = _mm_add_epi32( A09, C8 ); \ - A08 = _mm_add_epi32( A08, C7 ); \ - A07 = _mm_add_epi32( A07, C6 ); \ - A06 = _mm_add_epi32( A06, C5 ); \ - A05 = _mm_add_epi32( A05, C4 ); \ - A04 = _mm_add_epi32( A04, C3 ); \ - A03 = _mm_add_epi32( A03, C2 ); \ - A02 = _mm_add_epi32( A02, C1 ); \ - A01 = _mm_add_epi32( A01, C0 ); \ - A00 = _mm_add_epi32( A00, CF ); \ - A0B = _mm_add_epi32( A0B, CE ); \ - A0A = _mm_add_epi32( A0A, CD ); \ - A09 = _mm_add_epi32( A09, CC ); \ - A08 = _mm_add_epi32( A08, CB ); \ - A07 = _mm_add_epi32( A07, CA ); \ - A06 = _mm_add_epi32( A06, C9 ); \ - A05 = _mm_add_epi32( A05, C8 ); \ - A04 = _mm_add_epi32( 
A04, C7 ); \ - A03 = _mm_add_epi32( A03, C6 ); \ - A02 = _mm_add_epi32( A02, C5 ); \ - A01 = _mm_add_epi32( A01, C4 ); \ - A00 = _mm_add_epi32( A00, C3 ); \ + AB = _mm_add_epi32( AB, C6 ); \ + AA = _mm_add_epi32( AA, C5 ); \ + A9 = _mm_add_epi32( A9, C4 ); \ + A8 = _mm_add_epi32( A8, C3 ); \ + A7 = _mm_add_epi32( A7, C2 ); \ + A6 = _mm_add_epi32( A6, C1 ); \ + A5 = _mm_add_epi32( A5, C0 ); \ + A4 = _mm_add_epi32( A4, CF ); \ + A3 = _mm_add_epi32( A3, CE ); \ + A2 = _mm_add_epi32( A2, CD ); \ + A1 = _mm_add_epi32( A1, CC ); \ + A0 = _mm_add_epi32( A0, CB ); \ + AB = _mm_add_epi32( AB, CA ); \ + AA = _mm_add_epi32( AA, C9 ); \ + A9 = _mm_add_epi32( A9, C8 ); \ + A8 = _mm_add_epi32( A8, C7 ); \ + A7 = _mm_add_epi32( A7, C6 ); \ + A6 = _mm_add_epi32( A6, C5 ); \ + A5 = _mm_add_epi32( A5, C4 ); \ + A4 = _mm_add_epi32( A4, C3 ); \ + A3 = _mm_add_epi32( A3, C2 ); \ + A2 = _mm_add_epi32( A2, C1 ); \ + A1 = _mm_add_epi32( A1, C0 ); \ + A0 = _mm_add_epi32( A0, CF ); \ + AB = _mm_add_epi32( AB, CE ); \ + AA = _mm_add_epi32( AA, CD ); \ + A9 = _mm_add_epi32( A9, CC ); \ + A8 = _mm_add_epi32( A8, CB ); \ + A7 = _mm_add_epi32( A7, CA ); \ + A6 = _mm_add_epi32( A6, C9 ); \ + A5 = _mm_add_epi32( A5, C8 ); \ + A4 = _mm_add_epi32( A4, C7 ); \ + A3 = _mm_add_epi32( A3, C6 ); \ + A2 = _mm_add_epi32( A2, C5 ); \ + A1 = _mm_add_epi32( A1, C4 ); \ + A0 = _mm_add_epi32( A0, C3 ); \ } while (0) #define INCR_W do { \ diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index 8880b45..2952112 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -10,6 +10,7 @@ #include "algo-gate-api.h" #include "Verthash.h" #include "mm_malloc.h" +#include "malloc-huge.h" //----------------------------------------------------------------------------- // Verthash info management @@ -84,12 +85,18 @@ int verthash_info_init(verthash_info_t* info, const char* file_name) } // Allocate data - info->data = (uint8_t *)_mm_malloc( fileSize, 64 ); - if (!info->data) + info->data = (uint8_t *)malloc_hugepages( fileSize ); + if ( info->data ) + if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages"); + else { - fclose(fileMiningData); - // Memory allocation fatal error. - return 2; + info->data = (uint8_t *)_mm_malloc( fileSize, 64 ); + if (!info->data) + { + fclose(fileMiningData); + // Memory allocation fatal error. 
+ return 2; + } } // Load data diff --git a/algo/verthash/tiny_sha3/sha3-4way.c b/algo/verthash/tiny_sha3/sha3-4way.c index abbc848..debbd77 100644 --- a/algo/verthash/tiny_sha3/sha3-4way.c +++ b/algo/verthash/tiny_sha3/sha3-4way.c @@ -29,16 +29,11 @@ void sha3_4way_keccakf( __m256i st[25] ) for ( r = 0; r < KECCAKF_ROUNDS; r++ ) { // Theta - bc[0] = _mm256_xor_si256( st[0], - mm256_xor4( st[5], st[10], st[15], st[20] ) ); - bc[1] = _mm256_xor_si256( st[1], - mm256_xor4( st[6], st[11], st[16], st[21] ) ); - bc[2] = _mm256_xor_si256( st[2], - mm256_xor4( st[7], st[12], st[17], st[22] ) ); - bc[3] = _mm256_xor_si256( st[3], - mm256_xor4( st[8], st[13], st[18], st[23] ) ); - bc[4] = _mm256_xor_si256( st[4], - mm256_xor4( st[9], st[14], st[19], st[24] ) ); + bc[0] = mm256_xor3( st[0], st[5], mm256_xor3( st[10], st[15], st[20] ) ); + bc[1] = mm256_xor3( st[1], st[6], mm256_xor3( st[11], st[16], st[21] ) ); + bc[2] = mm256_xor3( st[2], st[7], mm256_xor3( st[12], st[17], st[22] ) ); + bc[3] = mm256_xor3( st[3], st[8], mm256_xor3( st[13], st[18], st[23] ) ); + bc[4] = mm256_xor3( st[4], st[9], mm256_xor3( st[14], st[19], st[24] ) ); for ( i = 0; i < 5; i++ ) { @@ -89,17 +84,13 @@ void sha3_4way_keccakf( __m256i st[25] ) // Chi for ( j = 0; j < 25; j += 5 ) { - memcpy( bc, &st[ j ], 5*32 ); - st[ j ] = _mm256_xor_si256( st[ j ], - _mm256_andnot_si256( bc[1], bc[2] ) ); - st[ j+1 ] = _mm256_xor_si256( st[ j+1 ], - _mm256_andnot_si256( bc[2], bc[3] ) ); - st[ j+2 ] = _mm256_xor_si256( st[ j+2 ], - _mm256_andnot_si256( bc[3], bc[4] ) ); - st[ j+3 ] = _mm256_xor_si256( st[ j+3 ], - _mm256_andnot_si256( bc[4], bc[0] ) ); - st[ j+4 ] = _mm256_xor_si256( st[ j+4 ], - _mm256_andnot_si256( bc[0], bc[1] ) ); + bc[0] = st[j]; + bc[1] = st[j+1]; + st[ j ] = mm256_xorandnot( st[ j ], st[j+1], st[j+2] ); + st[ j+1 ] = mm256_xorandnot( st[ j+1 ], st[j+2], st[j+3] ); + st[ j+2 ] = mm256_xorandnot( st[ j+2 ], st[j+3], st[j+4] ); + st[ j+3 ] = mm256_xorandnot( st[ j+3 ], st[j+4], bc[0] ); + st[ j+4 ] = mm256_xorandnot( st[ j+4 ], bc[0], bc[1] ); } // Iota diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index ec808f6..eeb2e5d 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -127,7 +127,7 @@ bool register_verthash_algo( algo_gate_t* gate ) { opt_target_factor = 256.0; gate->scanhash = (void*)&scanhash_verthash; - gate->optimizations = AVX2_OPT; + gate->optimizations = SSE42_OPT | AVX2_OPT; const char *verthash_data_file = opt_data_file ? opt_data_file : default_verthash_data_file; diff --git a/configure b/configure index f678bda..e76f139 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.3. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.19.2' -PACKAGE_STRING='cpuminer-opt 3.19.2' +PACKAGE_VERSION='3.19.3' +PACKAGE_STRING='cpuminer-opt 3.19.3' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.19.2 to adapt to many kinds of systems. 
+\`configure' configures cpuminer-opt 3.19.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.19.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.19.3:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.19.2 +cpuminer-opt configure 3.19.3 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.19.2, which was +It was created by cpuminer-opt $as_me 3.19.3, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.19.2' + VERSION='3.19.3' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.19.2, which was +This file was extended by cpuminer-opt $as_me 3.19.3, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.19.2 +cpuminer-opt config.status 3.19.3 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 314b0d5..2b17493 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.19.2]) +AC_INIT([cpuminer-opt], [3.19.3]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/malloc-huge.c b/malloc-huge.c new file mode 100644 index 0000000..75c0165 --- /dev/null +++ b/malloc-huge.c @@ -0,0 +1,36 @@ +#include "malloc-huge.h" +#include "miner.h" + +#define HUGEPAGE_SIZE_2M (2 * 1024 * 1024) + +void *malloc_hugepages( size_t size ) +{ +#if !(defined(MAP_HUGETLB) && defined(MAP_ANON)) +// applog( LOG_WARNING, "Huge pages not available",size); + return NULL; +#else + + if ( size < HUGEPAGE_MIN_ALLOC ) + { +// applog( LOG_WARNING, "Block too small for huge pages: %lu bytes",size); + return NULL; + } + + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE_2M - 1; + void *p = NULL; + int flags = + #ifdef MAP_NOCORE + MAP_NOCORE | + #endif + MAP_HUGETLB | MAP_ANON | MAP_PRIVATE; + + // round size up to next page boundary + size = ( size + hugepage_mask ) & (~hugepage_mask); + + p = mmap( NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0 ); + if ( p == MAP_FAILED ) + p = NULL; + return p; +#endif +} + diff --git a/malloc-huge.h b/malloc-huge.h new file mode 100644 index 0000000..371e09a --- /dev/null +++ b/malloc-huge.h @@ -0,0 +1,24 @@ +#if !(defined(MALLOC_HUGE__)) +#define MALLOC_HUGE__ + +#include +#include +#include +#include + +#ifdef __unix__ +#include +#endif + +#if defined(MAP_HUGETLB) + +// Minimum block size 6 MiB to use huge pages +#define HUGEPAGE_MIN_ALLOC (6 * 1024 * 1024) + +#endif + +// Attempt to allocate memory backed by 2 MiB pages, returns NULL on failure. 
+void *malloc_hugepages( size_t size ); + +#endif + diff --git a/winbuild-cross.sh b/winbuild-cross.sh index ec73859..5774430 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ ./clean-all.sh || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS +CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe @@ -53,7 +53,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe # AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS +CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512.exe @@ -61,7 +61,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe # AVX2 SHA VAES: Intel Alderlake, AMD Zen3 make clean || echo done rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -mavx2 -msha -mvaes" ./configure $CONFIGURE_ARGS +CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe @@ -69,7 +69,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe # AVX2 AES SHA: AMD Zen1 make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS +CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2-sha.exe @@ -77,7 +77,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe # AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS -march=core-avx2" ./configure $CONFIGURE_ARGS +CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2.exe @@ -85,7 +85,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe # AVX AES: Intel Sandybridge, Ivybridge make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS_OLD -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS +CFLAGS="-march=corei7-avx -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx.exe @@ -93,7 +93,7 @@ mv cpuminer.exe release/cpuminer-avx.exe # SSE4.2 AES: Intel Westmere make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS_OLD -march=westmere -maes" ./configure $CONFIGURE_ARGS +CFLAGS="-march=westmere -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-aes-sse42.exe @@ -118,9 +118,16 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Generic SSE2 make clean || echo clean rm -f config.status -CFLAGS="$DEFAULT_CFLAGS_OLD -msse2" ./configure $CONFIGURE_ARGS +CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-sse2.exe make clean || echo clean +# Native with CPU groups enabled +make clean || echo clean +rm -f config.status +CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS +make -j 8 +strip -s cpuminer.exe +
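
The sha3-4way.c change above folds Keccak's five-input theta XOR into nested three-input XORs (mm256_xor3, which the project's helpers can map to a single vpternlogq where AVX512-VL is available) and drops the 5*32-byte memcpy from the chi step: only the first two lanes of each row are still read after they have been overwritten, so saving st[j] and st[j+1] is enough. A minimal scalar sketch of that chi restructuring, assuming plain uint64_t lanes instead of the 4-way __m256i state (keccak_chi_row is an illustrative name, not part of the source):

#include <stdint.h>

/* One row of Keccak chi: a[x] ^= ~a[x+1] & a[x+2], indices mod 5.
   Only st[0] and st[1] are read after being overwritten, so two saved
   lanes replace a full copy of the row. */
static void keccak_chi_row( uint64_t st[5] )
{
   uint64_t t0 = st[0];
   uint64_t t1 = st[1];
   st[0] ^= ~st[1] & st[2];
   st[1] ^= ~st[2] & st[3];
   st[2] ^= ~st[3] & st[4];
   st[3] ^= ~st[4] & t0;
   st[4] ^= ~t0    & t1;
}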
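
The new malloc-huge.c maps anonymous, private memory backed by 2 MiB pages with mmap(MAP_HUGETLB), which on Linux only succeeds when huge pages have been reserved (e.g. via vm.nr_hugepages), rounds the request up to a huge-page boundary, and returns NULL on any failure so callers such as verthash_info_init can fall back to an ordinary allocation. A self-contained sketch of that allocate-or-fall-back pattern, assuming Linux; alloc_huge, the request size and the mapped_len out-parameter are illustrative additions, not the source's API (malloc_hugepages itself returns only the pointer):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>          /* mmap, munmap, MAP_HUGETLB on Linux */

#define HUGEPAGE_SIZE_2M ( 2UL * 1024 * 1024 )

/* Illustrative equivalent of malloc_hugepages(): anonymous private mapping
   backed by 2 MiB pages, request rounded up to a huge-page boundary,
   NULL on failure so the caller can fall back to ordinary pages. */
static void *alloc_huge( size_t size, size_t *mapped_len )
{
#if defined(MAP_HUGETLB) && defined(MAP_ANONYMOUS)
   size = ( size + HUGEPAGE_SIZE_2M - 1 ) & ~( HUGEPAGE_SIZE_2M - 1 );
   void *p = mmap( NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
   if ( p == MAP_FAILED ) return NULL;
   *mapped_len = size;          /* munmap() later needs the rounded length */
   return p;
#else
   (void)size; (void)mapped_len;
   return NULL;
#endif
}

int main(void)
{
   size_t request = 64UL * 1024 * 1024;   /* stand-in for the Verthash data size */
   size_t mapped = 0;
   unsigned char *data = alloc_huge( request, &mapped );

   if ( data )
      printf( "using huge pages, %zu bytes mapped\n", mapped );
   else
   {
      data = malloc( request );           /* ordinary pages as fallback */
      if ( !data ) return 1;
   }

   memset( data, 0, request );            /* touch the memory */

   if ( mapped ) munmap( data, mapped );
   else          free( data );
   return 0;
}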