This commit is contained in:
Jay D Dee
2022-01-07 12:07:38 -05:00
parent 0e3945ddb5
commit 17ccbc328f
17 changed files with 657 additions and 606 deletions

View File

@@ -21,6 +21,7 @@ cpuminer_SOURCES = \
api.c \ api.c \
sysinfos.c \ sysinfos.c \
algo-gate-api.c\ algo-gate-api.c\
malloc-huge.c \
algo/argon2/argon2a/argon2a.c \ algo/argon2/argon2a/argon2a.c \
algo/argon2/argon2a/ar2/argon2.c \ algo/argon2/argon2a/ar2/argon2.c \
algo/argon2/argon2a/ar2/opt.c \ algo/argon2/argon2a/ar2/opt.c \

View File

@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
Change Log Change Log
---------- ----------
v3.19.3
Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available.
Small speedup for Hamsi AVX2 & AVX512, Keccak AVX512.
v3.19.2 v3.19.2
Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1. Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1.

View File

@@ -37,6 +37,13 @@
#if defined(__AVX512F__) #if defined(__AVX512F__)
static inline __m512i blamka( __m512i x, __m512i y )
{
__m512i xy = _mm512_mul_epu32( x, y );
return _mm512_add_epi64( _mm512_add_epi64( x, y ),
_mm512_add_epi64( xy, xy ) );
}
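// For reference, a minimal scalar sketch of what blamka computes per 64-bit
// lane (the fBlaMka multiply-add referenced in the SSE path below):
// _mm512_mul_epu32 multiplies the low 32 bits of each 64-bit lane, so the
// vector form above is x + y + 2 * lo32(x) * lo32(y) lane-wise.
// blamka_ref is an illustrative name, not part of this commit.
static inline uint64_t blamka_ref( uint64_t x, uint64_t y )
{
   const uint64_t xy = ( x & 0xffffffff ) * ( y & 0xffffffff );
   return x + y + 2 * xy;
}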
static void fill_block( __m512i *state, const block *ref_block, static void fill_block( __m512i *state, const block *ref_block,
block *next_block, int with_xor ) block *next_block, int with_xor )
{ {

View File

@@ -328,9 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
#include <immintrin.h> #include <immintrin.h>
#define ROR64(x, n) _mm512_ror_epi64((x), (n)) static inline __m512i muladd(__m512i x, __m512i y)
static __m512i muladd(__m512i x, __m512i y)
{ {
__m512i z = _mm512_mul_epu32(x, y); __m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z)); return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
@@ -344,8 +342,8 @@ static __m512i muladd(__m512i x, __m512i y)
D0 = _mm512_xor_si512(D0, A0); \ D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \ D1 = _mm512_xor_si512(D1, A1); \
\ \
D0 = ROR64(D0, 32); \ D0 = _mm512_ror_epi64(D0, 32); \
D1 = ROR64(D1, 32); \ D1 = _mm512_ror_epi64(D1, 32); \
\ \
C0 = muladd(C0, D0); \ C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \ C1 = muladd(C1, D1); \
@@ -353,8 +351,8 @@ static __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_xor_si512(B0, C0); \ B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \ B1 = _mm512_xor_si512(B1, C1); \
\ \
B0 = ROR64(B0, 24); \ B0 = _mm512_ror_epi64(B0, 24); \
B1 = ROR64(B1, 24); \ B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0) } while ((void)0, 0)
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -365,8 +363,8 @@ static __m512i muladd(__m512i x, __m512i y)
D0 = _mm512_xor_si512(D0, A0); \ D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \ D1 = _mm512_xor_si512(D1, A1); \
\ \
D0 = ROR64(D0, 16); \ D0 = _mm512_ror_epi64(D0, 16); \
D1 = ROR64(D1, 16); \ D1 = _mm512_ror_epi64(D1, 16); \
\ \
C0 = muladd(C0, D0); \ C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \ C1 = muladd(C1, D1); \
@@ -374,8 +372,8 @@ static __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_xor_si512(B0, C0); \ B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \ B1 = _mm512_xor_si512(B1, C1); \
\ \
B0 = ROR64(B0, 63); \ B0 = _mm512_ror_epi64(B0, 63); \
B1 = ROR64(B1, 63); \ B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0) } while ((void)0, 0)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -417,11 +415,10 @@ static __m512i muladd(__m512i x, __m512i y)
#define SWAP_HALVES(A0, A1) \ #define SWAP_HALVES(A0, A1) \
do { \ do { \
__m512i t0, t1; \ __m512i t; \
t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \ t = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \ A1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
A0 = t0; \ A0 = t; \
A1 = t1; \
} while((void)0, 0) } while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \ #define SWAP_QUARTERS(A0, A1) \

View File

@@ -545,21 +545,23 @@ static const sph_u32 T512[64][16] = {
#define sE c7 #define sE c7
#define sF m7 #define sF m7
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi 8 way AVX512 // Hamsi 8 way AVX512
// Tested on i9-9940x movepi64_mask is slow, cmple_epi64_mask with zero // Intel says _mm512_movepi64_mask has (1L/1T) timing while
// produces the same result but is faster. // _mm512_cmplt_epi64_mask has (3L/1T) timing; however, when tested hashing X13
// on i9-9940x cmplt with zero was 3% faster than movepi.
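// Illustrative sketch (hypothetical helpers, not part of this commit): both
// forms return an 8-bit mask with bit i set when the sign bit of 64-bit lane i
// is set, which is why cmplt against zero can stand in for movepi64_mask here.
static inline __mmask8 msb_mask_movepi( __m512i v )
{
   return _mm512_movepi64_mask( v );                              // collect sign bits
}
static inline __mmask8 msb_mask_cmplt( __m512i v )
{
   return _mm512_cmplt_epi64_mask( v, _mm512_setzero_si512() );   // signed v < 0
}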
#define INPUT_BIG8 \ #define INPUT_BIG8 \
do { \ do { \
__m512i db = _mm512_ror_epi64( *buf, 1 ); \ __m512i db = _mm512_ror_epi64( *buf, 1 ); \
const __m512i zero = m512_zero; \
const uint64_t *tp = (const uint64_t*)T512; \ const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \ m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 0; u < 64; u++ ) \ for ( int u = 0; u < 64; u++ ) \
{ \ { \
__mmask8 dm = _mm512_cmplt_epi64_mask( db, m512_zero ); \ const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \ m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \ m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \ m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
@@ -573,29 +575,6 @@ do { \
} \ } \
} while (0) } while (0)
/*
#define INPUT_BIG8 \
do { \
__m512i db = *buf; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m512i dm = mm512_negate_64( _mm512_and_si512( db, m512_one_64 ) ); \
m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
tp += 8; \
db = _mm512_srli_epi64( db, 1 ); \
} \
} while (0)
*/
#define SBOX8( a, b, c, d ) \ #define SBOX8( a, b, c, d ) \
do { \ do { \
__m512i t; \ __m512i t; \
@@ -632,199 +611,192 @@ do { \
#define READ_STATE_BIG8(sc) \ #define READ_STATE_BIG8(sc) \
do { \ do { \
c0 = sc->h[0x0]; \ c0 = sc->h[0]; \
c1 = sc->h[0x1]; \ c1 = sc->h[1]; \
c2 = sc->h[0x2]; \ c2 = sc->h[2]; \
c3 = sc->h[0x3]; \ c3 = sc->h[3]; \
c4 = sc->h[0x4]; \ c4 = sc->h[4]; \
c5 = sc->h[0x5]; \ c5 = sc->h[5]; \
c6 = sc->h[0x6]; \ c6 = sc->h[6]; \
c7 = sc->h[0x7]; \ c7 = sc->h[7]; \
} while (0) } while (0)
#define WRITE_STATE_BIG8(sc) \ #define WRITE_STATE_BIG8(sc) \
do { \ do { \
sc->h[0x0] = c0; \ sc->h[0] = c0; \
sc->h[0x1] = c1; \ sc->h[1] = c1; \
sc->h[0x2] = c2; \ sc->h[2] = c2; \
sc->h[0x3] = c3; \ sc->h[3] = c3; \
sc->h[0x4] = c4; \ sc->h[4] = c4; \
sc->h[0x5] = c5; \ sc->h[5] = c5; \
sc->h[0x6] = c6; \ sc->h[6] = c6; \
sc->h[0x7] = c7; \ sc->h[7] = c7; \
} while (0) } while (0)
#define ROUND_BIG8( alpha ) \ #define ROUND_BIG8( alpha ) \
do { \ do { \
__m512i t0, t1, t2, t3; \ __m512i t0, t1, t2, t3; \
s0 = _mm512_xor_si512( s0, alpha[ 0] ); \ s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
s1 = _mm512_xor_si512( s1, alpha[ 1] ); \ s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
s2 = _mm512_xor_si512( s2, alpha[ 2] ); \ s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
s3 = _mm512_xor_si512( s3, alpha[ 3] ); \ s3 = _mm512_xor_si512( s3, alpha[ 3] ); /* c1 */ \
s4 = _mm512_xor_si512( s4, alpha[ 4] ); \ s4 = _mm512_xor_si512( s4, alpha[ 4] ); /* c2 */ \
s5 = _mm512_xor_si512( s5, alpha[ 5] ); \ s5 = _mm512_xor_si512( s5, alpha[ 5] ); /* m2 */ \
s6 = _mm512_xor_si512( s6, alpha[ 6] ); \ s6 = _mm512_xor_si512( s6, alpha[ 6] ); /* c3 */ \
s7 = _mm512_xor_si512( s7, alpha[ 7] ); \ s7 = _mm512_xor_si512( s7, alpha[ 7] ); /* m3 */ \
s8 = _mm512_xor_si512( s8, alpha[ 8] ); \ s8 = _mm512_xor_si512( s8, alpha[ 8] ); /* m4 */ \
s9 = _mm512_xor_si512( s9, alpha[ 9] ); \ s9 = _mm512_xor_si512( s9, alpha[ 9] ); /* c4 */ \
sA = _mm512_xor_si512( sA, alpha[10] ); \ sA = _mm512_xor_si512( sA, alpha[10] ); /* m5 */ \
sB = _mm512_xor_si512( sB, alpha[11] ); \ sB = _mm512_xor_si512( sB, alpha[11] ); /* c5 */ \
sC = _mm512_xor_si512( sC, alpha[12] ); \ sC = _mm512_xor_si512( sC, alpha[12] ); /* c6 */ \
sD = _mm512_xor_si512( sD, alpha[13] ); \ sD = _mm512_xor_si512( sD, alpha[13] ); /* m6 */ \
sE = _mm512_xor_si512( sE, alpha[14] ); \ sE = _mm512_xor_si512( sE, alpha[14] ); /* c7 */ \
sF = _mm512_xor_si512( sF, alpha[15] ); \ sF = _mm512_xor_si512( sF, alpha[15] ); /* m7 */ \
\ \
SBOX8( s0, s4, s8, sC ); \ SBOX8( s0, s4, s8, sC ); /* ( m0, c2, m4, c6 ) */ \
SBOX8( s1, s5, s9, sD ); \ SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \
SBOX8( s2, s6, sA, sE ); \ SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); \ SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
\ \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \ s4 = mm512_swap64_32( s4 ); \
_mm512_bslli_epi128( s5, 4 ) ); \ s5 = mm512_swap64_32( s5 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \ sD = mm512_swap64_32( sD ); \
_mm512_bslli_epi128( sE, 4 ) ); \ sE = mm512_swap64_32( sE ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t1, s9, t3 ); \ L8( s0, t1, s9, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \ s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \ s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \ sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \ sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \
\ \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ s6 = mm512_swap64_32( s6 ); \
_mm512_bslli_epi128( s6, 4 ) ); \ sF = mm512_swap64_32( sF ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
_mm512_bslli_epi128( sF, 4 ) ); \ t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
L8( s1, t1, sA, t3 ); \ L8( s1, t1, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \ s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \ sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \ sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
\ \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \ s7 = mm512_swap64_32( s7 ); \
_mm512_bslli_epi128( s7, 4 ) ); \ sC = mm512_swap64_32( sC ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
_mm512_bslli_epi128( sC, 4 ) ); \ t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t1, sB, t3 ); \ L8( s2, t1, sB, t3 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \ s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \ s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \ sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \ sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\ \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
_mm512_bslli_epi128( s4, 4 ) ); \ t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
L8( s3, t1, s8, t3 ); \ L8( s3, t1, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \ s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \ s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \ sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \ sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\ \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \ t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \ t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \ t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
_mm512_bslli_epi128( sB, 4 ) ); \ t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \ L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \ s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \ s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \ s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \ s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \ s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \ sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \ s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \ sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
\ \
t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \ t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
_mm512_bslli_epi128( sD, 4 ) ); \ t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \ t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
L8( t0, t1, t2, t3 ); \ L8( t0, t1, t2, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \ s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \ sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \ sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \ s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \ sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \ s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \ sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
} while (0) } while (0)
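// Illustrative sketch of the pattern change in ROUND_BIG8 (hypothetical helper,
// not part of this commit): the old code gathered odd/even 32-bit words with a
// 4-byte shift per operand plus a blend,
//    t = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( a, 4 ),
//                                         _mm512_bslli_epi128( b, 4 ) );
// the new code swaps the 32-bit halves of each 64-bit element once (assumed
// here to be what mm512_swap64_32 does, i.e. a 0xb1 dword shuffle) and blends.
// That selects the same words; the temporary swaps are accounted for by the
// adjusted blend masks on write-back and the swap-backs later in the round.
static inline __m512i hamsi_gather_sketch( __m512i a, __m512i b )
{
   const __m512i a_sw = _mm512_shuffle_epi32( a, _MM_PERM_CDAB );  // swap 32-bit halves
   const __m512i b_sw = _mm512_shuffle_epi32( b, _MM_PERM_CDAB );
   return _mm512_mask_blend_epi32( 0xaaaa, a_sw, b_sw );
}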
#define P_BIG8 \ #define P_BIG8 \
do { \ do { \
__m512i alpha[16]; \ __m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \ for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \ alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
} while (0) } while (0)
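// Illustrative sketch (hypothetical helper, not part of this commit): between
// rounds only alpha[0] changes, with the round index xored into its upper
// 32 bits, so hoisting A0 = ((uint64_t*)alpha_n)[0] lets each round rebuild
// its constant with one shift and one xor instead of re-reading the table.
static inline uint64_t hamsi_alpha0_sketch( uint64_t A0, uint64_t round )
{
   return ( round << 32 ) ^ A0;   // round index lands in the high 32 bits
}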
#define PF_BIG8 \ #define PF_BIG8 \
do { \ do { \
__m512i alpha[16]; \ __m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \ for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \ alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \ alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \ alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \ alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \ alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \ alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \ alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG8( alpha ); \ ROUND_BIG8( alpha ); \
} while (0) } while (0)
#define T_BIG8 \ #define T_BIG8 \
do { /* order is important */ \ do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \ c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB ); /* c5 */ \
c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \ c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA ); /* m5 */ \
c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \ c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 ); /* c4 */ \
c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \ c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 ); /* m4 */ \
c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \ c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 ); /* c1 */ \
c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \ c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 ); /* m1 */ \
c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \ c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 ); /* c0 */ \
c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \ c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \
} while (0) } while (0)
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
@@ -861,7 +833,6 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
WRITE_STATE_BIG8( sc ); WRITE_STATE_BIG8( sc );
} }
void hamsi512_8way_init( hamsi_8way_big_context *sc ) void hamsi512_8way_init( hamsi_8way_big_context *sc )
{ {
sc->partial_len = 0; sc->partial_len = 0;
@@ -911,11 +882,12 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
#define INPUT_BIG \ #define INPUT_BIG \
do { \ do { \
__m256i db = *buf; \ __m256i db = *buf; \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \ const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \ m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 0; u < 64; u++ ) \ for ( int u = 63; u >= 0; u-- ) \
{ \ { \
__m256i dm = mm256_negate_64( _mm256_and_si256( db, m256_one_64 ) ); \ __m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
m256_const1_64( tp[0] ) ) ); \ m256_const1_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
@@ -933,7 +905,6 @@ do { \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \ m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
m256_const1_64( tp[7] ) ) ); \ m256_const1_64( tp[7] ) ) ); \
tp += 8; \ tp += 8; \
db = _mm256_srli_epi64( db, 1 ); \
} \ } \
} while (0) } while (0)
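// Illustrative scalar model of the rewritten INPUT_BIG loop (hypothetical
// helper, not part of this commit): the old and new forms visit the 64
// message bits in the same order, bit 0 first; the new form derives each lane
// mask from the original word with an independent shift instead of the
// serially updated db >>= 1, and cmpgt against zero broadcasts the sign bit.
static inline uint64_t hamsi_bit_mask_sketch( uint64_t db, int u )   // u = 63..0
{
   // all-ones when bit (63 - u) of db is set, else zero
   return (uint64_t)( (int64_t)( db << u ) >> 63 );
}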
@@ -982,47 +953,28 @@ do { \
#define READ_STATE_BIG(sc) \ #define READ_STATE_BIG(sc) \
do { \ do { \
c0 = sc->h[0x0]; \ c0 = sc->h[0]; \
c1 = sc->h[0x1]; \ c1 = sc->h[1]; \
c2 = sc->h[0x2]; \ c2 = sc->h[2]; \
c3 = sc->h[0x3]; \ c3 = sc->h[3]; \
c4 = sc->h[0x4]; \ c4 = sc->h[4]; \
c5 = sc->h[0x5]; \ c5 = sc->h[5]; \
c6 = sc->h[0x6]; \ c6 = sc->h[6]; \
c7 = sc->h[0x7]; \ c7 = sc->h[7]; \
} while (0) } while (0)
#define WRITE_STATE_BIG(sc) \ #define WRITE_STATE_BIG(sc) \
do { \ do { \
sc->h[0x0] = c0; \ sc->h[0] = c0; \
sc->h[0x1] = c1; \ sc->h[1] = c1; \
sc->h[0x2] = c2; \ sc->h[2] = c2; \
sc->h[0x3] = c3; \ sc->h[3] = c3; \
sc->h[0x4] = c4; \ sc->h[4] = c4; \
sc->h[0x5] = c5; \ sc->h[5] = c5; \
sc->h[0x6] = c6; \ sc->h[6] = c6; \
sc->h[0x7] = c7; \ sc->h[7] = c7; \
} while (0) } while (0)
/*
#define s0 m0
#define s1 c0
#define s2 m1
#define s3 c1
#define s4 c2
#define s5 m2
#define s6 c3
#define s7 m3
#define s8 m4
#define s9 c4
#define sA m5
#define sB c5
#define sC c6
#define sD m6
#define sE c7
#define sF m7
*/
#define ROUND_BIG( alpha ) \ #define ROUND_BIG( alpha ) \
do { \ do { \
__m256i t0, t1, t2, t3; \ __m256i t0, t1, t2, t3; \
@@ -1048,151 +1000,145 @@ do { \
SBOX( s2, s6, sA, sE ); \ SBOX( s2, s6, sA, sE ); \
SBOX( s3, s7, sB, sF ); \ SBOX( s3, s7, sB, sF ); \
\ \
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \ s4 = mm256_swap64_32( s4 ); \
_mm256_bslli_epi128( s5, 4 ), 0xAA ); \ s5 = mm256_swap64_32( s5 ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \ sD = mm256_swap64_32( sD ); \
_mm256_bslli_epi128( sE, 4 ), 0xAA ); \ sE = mm256_swap64_32( sE ); \
t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \
t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \
L( s0, t1, s9, t3 ); \ L( s0, t1, s9, t3 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \
s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \
sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ sD = _mm256_blend_epi32( sD, t3, 0x55 ); \
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ sE = _mm256_blend_epi32( sE, t3, 0xaa ); \
\ \
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ s6 = mm256_swap64_32( s6 ); \
_mm256_bslli_epi128( s6, 4 ), 0xAA ); \ sF = mm256_swap64_32( sF ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \ t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \
_mm256_bslli_epi128( sF, 4 ), 0xAA ); \ t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
L( s1, t1, sA, t3 ); \ L( s1, t1, sA, t3 ); \
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \
sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ sE = _mm256_blend_epi32( sE, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
\ \
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \ s7 = mm256_swap64_32( s7 ); \
_mm256_bslli_epi128( s7, 4 ), 0xAA ); \ sC = mm256_swap64_32( sC ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \ t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \
_mm256_bslli_epi128( sC, 4 ), 0xAA ); \ t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \
L( s2, t1, sB, t3 ); \ L( s2, t1, sB, t3 ); \
s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \
s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \
sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ sF = _mm256_blend_epi32( sF, t3, 0x55 ); \
sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ sC = _mm256_blend_epi32( sC, t3, 0xaa ); \
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
\ \
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \ t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \
_mm256_bslli_epi128( s4, 4 ), 0xAA ); \ t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
L( s3, t1, s8, t3 ); \ L( s3, t1, s8, t3 ); \
s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \
sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ sC = _mm256_blend_epi32( sC, t3, 0x55 ); \
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ sD = _mm256_blend_epi32( sD, t3, 0xaa ); \
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
\ \
t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \ t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \
t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \ t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \
t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \ t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \ t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \
_mm256_bslli_epi128( sB, 4 ), 0xAA ); \ t3 = mm256_swap64_32( t3 ); \
L( t0, t1, t2, t3 ); \ L( t0, t1, t2, t3 ); \
t3 = mm256_swap64_32( t3 ); \
s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \ s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \ s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \
s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \ s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \ s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \
s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \ s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \
sA = _mm256_blend_epi32( sA, t2, 0xAA ); \ sA = _mm256_blend_epi32( sA, t2, 0xaa ); \
s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \ s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \
sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \ sB = _mm256_blend_epi32( sB, t3, 0x55 ); \
\ \
t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \ t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \ t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \
t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \
t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
L( t0, t1, t2, t3 ); \ L( t0, t1, t2, t3 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \ s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \
sC = _mm256_blend_epi32( sC, t0, 0xAA ); \ sC = _mm256_blend_epi32( sC, t0, 0xaa ); \
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \ s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \ sD = _mm256_blend_epi32( sD, t1, 0xaa ); \
s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \ s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \ sE = _mm256_blend_epi32( sE, t2, 0xaa ); \
s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \ s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, t3, 0xAA ); \ sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
s4 = mm256_swap64_32( s4 ); \
s5 = mm256_swap64_32( s5 ); \
sD = mm256_swap64_32( sD ); \
sE = mm256_swap64_32( sE ); \
} while (0) } while (0)
#define P_BIG \ #define P_BIG \
do { \ do { \
__m256i alpha[16]; \ __m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \ for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \ alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_n )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
} while (0) } while (0)
#define PF_BIG \ #define PF_BIG \
do { \ do { \
__m256i alpha[16]; \ __m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \ for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \ alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \ alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \ alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \ alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \ alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \ alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \ alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \
^ ( (uint64_t*)alpha_f )[0] ); \
ROUND_BIG( alpha ); \ ROUND_BIG( alpha ); \
} while (0) } while (0)
#define T_BIG \ #define T_BIG \
do { /* order is important */ \ do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \ c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \ c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \ c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \ c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \ c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \ c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \ c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \ c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \
} while (0) } while (0)
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )

View File

@@ -53,7 +53,8 @@ static const uint64_t RC[] = {
#define WRITE_STATE(sc) #define WRITE_STATE(sc)
#define MOV64(d, s) (d = s) #define MOV64(d, s) (d = s)
#define XOR64_IOTA XOR64 #define XOR64_IOTA XOR
#define LPAR ( #define LPAR (
#define RPAR ) #define RPAR )
@@ -71,14 +72,15 @@ static const uint64_t RC[] = {
// Targeted macros, keccak-macros.h is included for each target. // Targeted macros, keccak-macros.h is included for each target.
#define DECL64(x) __m512i x #define DECL64(x) __m512i x
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) #define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) #define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) #define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) #define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) #define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) #define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c)) #define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
#define XOR3( d, a, b, c ) (d = mm512_xor3( a, b, c ))
#include "keccak-macros.c" #include "keccak-macros.c"
@@ -236,6 +238,7 @@ keccak512_8way_close(void *cc, void *dst)
#undef INPUT_BUF #undef INPUT_BUF
#undef DECL64 #undef DECL64
#undef XOR64 #undef XOR64
#undef XOR
#undef AND64 #undef AND64
#undef OR64 #undef OR64
#undef NOT64 #undef NOT64
@@ -243,7 +246,7 @@ keccak512_8way_close(void *cc, void *dst)
#undef KECCAK_F_1600 #undef KECCAK_F_1600
#undef XOROR #undef XOROR
#undef XORAND #undef XORAND
#undef XOR3
#endif // AVX512 #endif // AVX512
// AVX2 // AVX2
@@ -255,13 +258,15 @@ keccak512_8way_close(void *cc, void *dst)
} while (0) } while (0)
#define DECL64(x) __m256i x #define DECL64(x) __m256i x
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) #define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n)) #define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c))) #define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c))) #define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))
#include "keccak-macros.c" #include "keccak-macros.c"
@@ -421,6 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
#undef INPUT_BUF #undef INPUT_BUF
#undef DECL64 #undef DECL64
#undef XOR64 #undef XOR64
#undef XOR
#undef AND64 #undef AND64
#undef OR64 #undef OR64
#undef NOT64 #undef NOT64
@@ -428,5 +434,6 @@ keccak512_4way_close(void *cc, void *dst)
#undef KECCAK_F_1600 #undef KECCAK_F_1600
#undef XOROR #undef XOROR
#undef XORAND #undef XORAND
#undef XOR3
#endif // AVX2 #endif // AVX2

View File

@@ -1,6 +1,19 @@
#ifdef TH_ELT #ifdef TH_ELT
#undef TH_ELT #undef TH_ELT
#endif #endif
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
XOR3( tt0, d0, d1, d4 ); \
XOR( tt1, d2, d3 ); \
XOR( tt0, tt0, tt1 ); \
ROL64( tt0, tt0, 1 ); \
XOR3( tt1, c0, c1, c4 ); \
XOR3( tt0, tt0, c2, c3 ); \
XOR( t, tt0, tt1 ); \
} while (0)
/*
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \ DECL64(tt0); \
DECL64(tt1); \ DECL64(tt1); \
@@ -17,7 +30,7 @@
XOR64(tt2, tt2, tt3); \ XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \ XOR64(t, tt0, tt2); \
} while (0) } while (0)
*/
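// Illustrative scalar model of the new TH_ELT (hypothetical helper, not part
// of this commit): it computes the Keccak theta term
//    t = ROL64( d0^d1^d2^d3^d4, 1 ) ^ ( c0^c1^c2^c3^c4 )
// with the xors grouped in threes so XOR3 can collapse to a single
// ternary-logic instruction where AVX512 provides one.
static inline uint64_t th_elt_ref( uint64_t c0, uint64_t c1, uint64_t c2,
                                   uint64_t c3, uint64_t c4,
                                   uint64_t d0, uint64_t d1, uint64_t d2,
                                   uint64_t d3, uint64_t d4 )
{
   const uint64_t d = d0 ^ d1 ^ d2 ^ d3 ^ d4;
   const uint64_t c = c0 ^ c1 ^ c2 ^ c3 ^ c4;
   return ( ( d << 1 ) | ( d >> 63 ) ) ^ c;   // rotate left by 1, xor column parity
}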
#ifdef THETA #ifdef THETA
#undef THETA #undef THETA
#endif #endif

View File

@@ -34,6 +34,7 @@
#include "algo/sha/sha-hash-4way.h" #include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h" #include "algo/sha/sha256-hash.h"
#include <mm_malloc.h> #include <mm_malloc.h>
#include "malloc-huge.h"
static const uint32_t keypad[12] = { static const uint32_t keypad[12] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
@@ -1487,11 +1488,19 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
bool scrypt_miner_thread_init( int thr_id ) bool scrypt_miner_thread_init( int thr_id )
{ {
scratchbuf = _mm_malloc( scratchbuf_size, 128 ); scratchbuf = malloc_hugepages( scratchbuf_size );
if ( scratchbuf ) if ( scratchbuf )
return true; {
if ( opt_debug )
applog( LOG_NOTICE, "Thread %u is using huge pages", thr_id );
}
else
scratchbuf = _mm_malloc( scratchbuf_size, 128 );
if ( scratchbuf ) return true;
applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id );
return false; return false;
} }
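// Illustrative sketch only: malloc-huge.c is not shown in this excerpt, so this
// is one plausible shape for malloc_hugepages on Linux, returning NULL on
// failure so callers can fall back to _mm_malloc as scrypt_miner_thread_init
// does above. The name malloc_hugepages_sketch is hypothetical.
#include <sys/mman.h>
#include <stddef.h>

static void *malloc_hugepages_sketch( size_t size )
{
#if defined(__linux__) && defined(MAP_HUGETLB)
   void *p = mmap( NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
   return p == MAP_FAILED ? NULL : p;
#else
   return NULL;   // no huge page support, let the caller fall back
#endif
}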
bool register_scrypt_algo( algo_gate_t* gate ) bool register_scrypt_algo( algo_gate_t* gate )

View File

@@ -62,8 +62,8 @@ extern "C"{
#if defined(__AVX2__) #if defined(__AVX2__)
#define DECL_STATE8 \ #define DECL_STATE8 \
__m256i A00, A01, A02, A03, A04, A05, A06, A07, \ __m256i A0, A1, A2, A3, A4, A5, A6, A7, \
A08, A09, A0A, A0B; \ A8, A9, AA, AB; \
__m256i B0, B1, B2, B3, B4, B5, B6, B7, \ __m256i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \ B8, B9, BA, BB, BC, BD, BE, BF; \
__m256i C0, C1, C2, C3, C4, C5, C6, C7, \ __m256i C0, C1, C2, C3, C4, C5, C6, C7, \
@@ -78,18 +78,18 @@ extern "C"{
{ \ { \
if ( (state)->state_loaded ) \ if ( (state)->state_loaded ) \
{ \ { \
A00 = (state)->A[0]; \ A0 = (state)->A[0]; \
A01 = (state)->A[1]; \ A1 = (state)->A[1]; \
A02 = (state)->A[2]; \ A2 = (state)->A[2]; \
A03 = (state)->A[3]; \ A3 = (state)->A[3]; \
A04 = (state)->A[4]; \ A4 = (state)->A[4]; \
A05 = (state)->A[5]; \ A5 = (state)->A[5]; \
A06 = (state)->A[6]; \ A6 = (state)->A[6]; \
A07 = (state)->A[7]; \ A7 = (state)->A[7]; \
A08 = (state)->A[8]; \ A8 = (state)->A[8]; \
A09 = (state)->A[9]; \ A9 = (state)->A[9]; \
A0A = (state)->A[10]; \ AA = (state)->A[10]; \
A0B = (state)->A[11]; \ AB = (state)->A[11]; \
B0 = (state)->B[0]; \ B0 = (state)->B[0]; \
B1 = (state)->B[1]; \ B1 = (state)->B[1]; \
B2 = (state)->B[2]; \ B2 = (state)->B[2]; \
@@ -126,18 +126,18 @@ extern "C"{
else \ else \
{ \ { \
(state)->state_loaded = true; \ (state)->state_loaded = true; \
A00 = m256_const1_64( 0x20728DFD20728DFD ); \ A0 = m256_const1_64( 0x20728DFD20728DFD ); \
A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \ A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m256_const1_64( 0xE782B699E782B699 ); \ A2 = m256_const1_64( 0xE782B699E782B699 ); \
A03 = m256_const1_64( 0x5530463255304632 ); \ A3 = m256_const1_64( 0x5530463255304632 ); \
A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \ A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \ A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \ A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \ A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \ A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m256_const1_64( 0x8BD144108BD14410 ); \ A9 = m256_const1_64( 0x8BD144108BD14410 ); \
A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \ AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \ AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \ B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m256_const1_64( 0x07B385F307B385F3 ); \ B1 = m256_const1_64( 0x07B385F307B385F3 ); \
B2 = m256_const1_64( 0xE7442C26E7442C26 ); \ B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
@@ -176,18 +176,18 @@ extern "C"{
} while (0) } while (0)
#define WRITE_STATE8(state) do { \ #define WRITE_STATE8(state) do { \
(state)->A[0] = A00; \ (state)->A[0] = A0; \
(state)->A[1] = A01; \ (state)->A[1] = A1; \
(state)->A[2] = A02; \ (state)->A[2] = A2; \
(state)->A[3] = A03; \ (state)->A[3] = A3; \
(state)->A[4] = A04; \ (state)->A[4] = A4; \
(state)->A[5] = A05; \ (state)->A[5] = A5; \
(state)->A[6] = A06; \ (state)->A[6] = A6; \
(state)->A[7] = A07; \ (state)->A[7] = A7; \
(state)->A[8] = A08; \ (state)->A[8] = A8; \
(state)->A[9] = A09; \ (state)->A[9] = A9; \
(state)->A[10] = A0A; \ (state)->A[10] = AA; \
(state)->A[11] = A0B; \ (state)->A[11] = AB; \
(state)->B[0] = B0; \ (state)->B[0] = B0; \
(state)->B[1] = B1; \ (state)->B[1] = B1; \
(state)->B[2] = B2; \ (state)->B[2] = B2; \
@@ -286,8 +286,8 @@ do { \
#define XOR_W8 \ #define XOR_W8 \
do { \ do { \
A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \ A0 = _mm256_xor_si256( A0, _mm256_set1_epi32( Wlow ) ); \
A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \ A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \
} while (0) } while (0)
#define SWAP_BC8 \ #define SWAP_BC8 \
@@ -321,60 +321,60 @@ do { \
} while (0) } while (0)
#define PERM_STEP_0_8 do { \ #define PERM_STEP_0_8 do { \
PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \ PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \ PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \ PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \ PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \ PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \ PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \ PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \ PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \ PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \ PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \ PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \ PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \ PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \ PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \ PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \ PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \
} while (0) } while (0)
#define PERM_STEP_1_8 do { \ #define PERM_STEP_1_8 do { \
PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \ PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \ PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \ PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \ PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \ PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \ PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \ PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \ PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \ PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \ PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \ PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \ PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \ PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \ PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \ PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \ PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \
} while (0) } while (0)
#define PERM_STEP_2_8 do { \ #define PERM_STEP_2_8 do { \
PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \ PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \ PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \ PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \ PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \ PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \ PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \ PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \ PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \ PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \ PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \ PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \ PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \ PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \ PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \ PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \ PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \
} while (0) } while (0)
#define APPLY_P8 \ #define APPLY_P8 \
@@ -398,42 +398,42 @@ do { \
PERM_STEP_0_8; \ PERM_STEP_0_8; \
PERM_STEP_1_8; \ PERM_STEP_1_8; \
PERM_STEP_2_8; \ PERM_STEP_2_8; \
A0B = _mm256_add_epi32( A0B, C6 ); \ AB = _mm256_add_epi32( AB, C6 ); \
A0A = _mm256_add_epi32( A0A, C5 ); \ AA = _mm256_add_epi32( AA, C5 ); \
A09 = _mm256_add_epi32( A09, C4 ); \ A9 = _mm256_add_epi32( A9, C4 ); \
A08 = _mm256_add_epi32( A08, C3 ); \ A8 = _mm256_add_epi32( A8, C3 ); \
A07 = _mm256_add_epi32( A07, C2 ); \ A7 = _mm256_add_epi32( A7, C2 ); \
A06 = _mm256_add_epi32( A06, C1 ); \ A6 = _mm256_add_epi32( A6, C1 ); \
A05 = _mm256_add_epi32( A05, C0 ); \ A5 = _mm256_add_epi32( A5, C0 ); \
A04 = _mm256_add_epi32( A04, CF ); \ A4 = _mm256_add_epi32( A4, CF ); \
A03 = _mm256_add_epi32( A03, CE ); \ A3 = _mm256_add_epi32( A3, CE ); \
A02 = _mm256_add_epi32( A02, CD ); \ A2 = _mm256_add_epi32( A2, CD ); \
A01 = _mm256_add_epi32( A01, CC ); \ A1 = _mm256_add_epi32( A1, CC ); \
A00 = _mm256_add_epi32( A00, CB ); \ A0 = _mm256_add_epi32( A0, CB ); \
A0B = _mm256_add_epi32( A0B, CA ); \ AB = _mm256_add_epi32( AB, CA ); \
A0A = _mm256_add_epi32( A0A, C9 ); \ AA = _mm256_add_epi32( AA, C9 ); \
A09 = _mm256_add_epi32( A09, C8 ); \ A9 = _mm256_add_epi32( A9, C8 ); \
A08 = _mm256_add_epi32( A08, C7 ); \ A8 = _mm256_add_epi32( A8, C7 ); \
A07 = _mm256_add_epi32( A07, C6 ); \ A7 = _mm256_add_epi32( A7, C6 ); \
A06 = _mm256_add_epi32( A06, C5 ); \ A6 = _mm256_add_epi32( A6, C5 ); \
A05 = _mm256_add_epi32( A05, C4 ); \ A5 = _mm256_add_epi32( A5, C4 ); \
A04 = _mm256_add_epi32( A04, C3 ); \ A4 = _mm256_add_epi32( A4, C3 ); \
A03 = _mm256_add_epi32( A03, C2 ); \ A3 = _mm256_add_epi32( A3, C2 ); \
A02 = _mm256_add_epi32( A02, C1 ); \ A2 = _mm256_add_epi32( A2, C1 ); \
A01 = _mm256_add_epi32( A01, C0 ); \ A1 = _mm256_add_epi32( A1, C0 ); \
A00 = _mm256_add_epi32( A00, CF ); \ A0 = _mm256_add_epi32( A0, CF ); \
A0B = _mm256_add_epi32( A0B, CE ); \ AB = _mm256_add_epi32( AB, CE ); \
A0A = _mm256_add_epi32( A0A, CD ); \ AA = _mm256_add_epi32( AA, CD ); \
A09 = _mm256_add_epi32( A09, CC ); \ A9 = _mm256_add_epi32( A9, CC ); \
A08 = _mm256_add_epi32( A08, CB ); \ A8 = _mm256_add_epi32( A8, CB ); \
A07 = _mm256_add_epi32( A07, CA ); \ A7 = _mm256_add_epi32( A7, CA ); \
A06 = _mm256_add_epi32( A06, C9 ); \ A6 = _mm256_add_epi32( A6, C9 ); \
A05 = _mm256_add_epi32( A05, C8 ); \ A5 = _mm256_add_epi32( A5, C8 ); \
A04 = _mm256_add_epi32( A04, C7 ); \ A4 = _mm256_add_epi32( A4, C7 ); \
A03 = _mm256_add_epi32( A03, C6 ); \ A3 = _mm256_add_epi32( A3, C6 ); \
A02 = _mm256_add_epi32( A02, C5 ); \ A2 = _mm256_add_epi32( A2, C5 ); \
A01 = _mm256_add_epi32( A01, C4 ); \ A1 = _mm256_add_epi32( A1, C4 ); \
A00 = _mm256_add_epi32( A00, C3 ); \ A0 = _mm256_add_epi32( A0, C3 ); \
} while (0) } while (0)
#define INCR_W8 do { \ #define INCR_W8 do { \
@@ -660,8 +660,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#define DECL_STATE \ #define DECL_STATE \
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \ __m128i A0, A1, A2, A3, A4, A5, A6, A7, \
A08, A09, A0A, A0B; \ A8, A9, AA, AB; \
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \ __m128i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \ B8, B9, BA, BB, BC, BD, BE, BF; \
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \ __m128i C0, C1, C2, C3, C4, C5, C6, C7, \
@@ -676,18 +676,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{ \ { \
if ( (state)->state_loaded ) \ if ( (state)->state_loaded ) \
{ \ { \
A00 = (state)->A[0]; \ A0 = (state)->A[0]; \
A01 = (state)->A[1]; \ A1 = (state)->A[1]; \
A02 = (state)->A[2]; \ A2 = (state)->A[2]; \
A03 = (state)->A[3]; \ A3 = (state)->A[3]; \
A04 = (state)->A[4]; \ A4 = (state)->A[4]; \
A05 = (state)->A[5]; \ A5 = (state)->A[5]; \
A06 = (state)->A[6]; \ A6 = (state)->A[6]; \
A07 = (state)->A[7]; \ A7 = (state)->A[7]; \
A08 = (state)->A[8]; \ A8 = (state)->A[8]; \
A09 = (state)->A[9]; \ A9 = (state)->A[9]; \
A0A = (state)->A[10]; \ AA = (state)->A[10]; \
A0B = (state)->A[11]; \ AB = (state)->A[11]; \
B0 = (state)->B[0]; \ B0 = (state)->B[0]; \
B1 = (state)->B[1]; \ B1 = (state)->B[1]; \
B2 = (state)->B[2]; \ B2 = (state)->B[2]; \
@@ -724,18 +724,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
else \ else \
{ \ { \
(state)->state_loaded = true; \ (state)->state_loaded = true; \
A00 = m128_const1_64( 0x20728DFD20728DFD ); \ A0 = m128_const1_64( 0x20728DFD20728DFD ); \
A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \ A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m128_const1_64( 0xE782B699E782B699 ); \ A2 = m128_const1_64( 0xE782B699E782B699 ); \
A03 = m128_const1_64( 0x5530463255304632 ); \ A3 = m128_const1_64( 0x5530463255304632 ); \
A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \ A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \ A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \ A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \ A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \ A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m128_const1_64( 0x8BD144108BD14410 ); \ A9 = m128_const1_64( 0x8BD144108BD14410 ); \
A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \ AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \ AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \ B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m128_const1_64( 0x07B385F307B385F3 ); \ B1 = m128_const1_64( 0x07B385F307B385F3 ); \
B2 = m128_const1_64( 0xE7442C26E7442C26 ); \ B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
@@ -774,18 +774,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
} while (0) } while (0)
#define WRITE_STATE(state) do { \ #define WRITE_STATE(state) do { \
(state)->A[0] = A00; \ (state)->A[0] = A0; \
(state)->A[1] = A01; \ (state)->A[1] = A1; \
(state)->A[2] = A02; \ (state)->A[2] = A2; \
(state)->A[3] = A03; \ (state)->A[3] = A3; \
(state)->A[4] = A04; \ (state)->A[4] = A4; \
(state)->A[5] = A05; \ (state)->A[5] = A5; \
(state)->A[6] = A06; \ (state)->A[6] = A6; \
(state)->A[7] = A07; \ (state)->A[7] = A7; \
(state)->A[8] = A08; \ (state)->A[8] = A8; \
(state)->A[9] = A09; \ (state)->A[9] = A9; \
(state)->A[10] = A0A; \ (state)->A[10] = AA; \
(state)->A[11] = A0B; \ (state)->A[11] = AB; \
(state)->B[0] = B0; \ (state)->B[0] = B0; \
(state)->B[1] = B1; \ (state)->B[1] = B1; \
(state)->B[2] = B2; \ (state)->B[2] = B2; \
@@ -884,8 +884,8 @@ do { \
#define XOR_W \ #define XOR_W \
do { \ do { \
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \ A0 = _mm_xor_si128( A0, _mm_set1_epi32( Wlow ) ); \
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \ A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
} while (0) } while (0)
@@ -940,60 +940,60 @@ do { \
} while (0) } while (0)
#define PERM_STEP_0 do { \ #define PERM_STEP_0 do { \
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ PERM_ELT(A2, A1, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ PERM_ELT(A3, A2, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ PERM_ELT(A4, A3, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ PERM_ELT(A5, A4, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ PERM_ELT(A6, A5, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ PERM_ELT(A7, A6, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ PERM_ELT(A8, A7, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ PERM_ELT(A9, A8, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ PERM_ELT(AA, A9, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ PERM_ELT(AB, AA, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ PERM_ELT(A0, AB, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ PERM_ELT(A1, A0, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ PERM_ELT(A2, A1, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ PERM_ELT(A3, A2, BF, BC, B8, B5, C9, MF); \
} while (0) } while (0)
#define PERM_STEP_1 do { \ #define PERM_STEP_1 do { \
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A4, A3, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ PERM_ELT(A5, A4, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ PERM_ELT(A6, A5, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ PERM_ELT(A7, A6, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ PERM_ELT(A8, A7, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ PERM_ELT(A9, A8, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ PERM_ELT(AA, A9, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ PERM_ELT(AB, AA, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ PERM_ELT(A0, AB, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ PERM_ELT(A1, A0, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ PERM_ELT(A2, A1, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ PERM_ELT(A3, A2, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ PERM_ELT(A4, A3, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ PERM_ELT(A5, A4, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ PERM_ELT(A6, A5, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ PERM_ELT(A7, A6, BF, BC, B8, B5, C9, MF); \
} while (0) } while (0)
#define PERM_STEP_2 do { \ #define PERM_STEP_2 do { \
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A8, A7, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ PERM_ELT(A9, A8, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ PERM_ELT(AA, A9, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ PERM_ELT(AB, AA, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ PERM_ELT(A0, AB, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ PERM_ELT(A1, A0, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ PERM_ELT(A2, A1, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ PERM_ELT(A3, A2, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ PERM_ELT(A4, A3, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ PERM_ELT(A5, A4, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ PERM_ELT(A6, A5, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ PERM_ELT(A7, A6, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ PERM_ELT(A8, A7, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ PERM_ELT(A9, A8, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ PERM_ELT(AA, A9, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ PERM_ELT(AB, AA, BF, BC, B8, B5, C9, MF); \
} while (0) } while (0)
#define APPLY_P \ #define APPLY_P \
@@ -1017,42 +1017,42 @@ do { \
PERM_STEP_0; \ PERM_STEP_0; \
PERM_STEP_1; \ PERM_STEP_1; \
PERM_STEP_2; \ PERM_STEP_2; \
A0B = _mm_add_epi32( A0B, C6 ); \ AB = _mm_add_epi32( AB, C6 ); \
A0A = _mm_add_epi32( A0A, C5 ); \ AA = _mm_add_epi32( AA, C5 ); \
A09 = _mm_add_epi32( A09, C4 ); \ A9 = _mm_add_epi32( A9, C4 ); \
A08 = _mm_add_epi32( A08, C3 ); \ A8 = _mm_add_epi32( A8, C3 ); \
A07 = _mm_add_epi32( A07, C2 ); \ A7 = _mm_add_epi32( A7, C2 ); \
A06 = _mm_add_epi32( A06, C1 ); \ A6 = _mm_add_epi32( A6, C1 ); \
A05 = _mm_add_epi32( A05, C0 ); \ A5 = _mm_add_epi32( A5, C0 ); \
A04 = _mm_add_epi32( A04, CF ); \ A4 = _mm_add_epi32( A4, CF ); \
A03 = _mm_add_epi32( A03, CE ); \ A3 = _mm_add_epi32( A3, CE ); \
A02 = _mm_add_epi32( A02, CD ); \ A2 = _mm_add_epi32( A2, CD ); \
A01 = _mm_add_epi32( A01, CC ); \ A1 = _mm_add_epi32( A1, CC ); \
A00 = _mm_add_epi32( A00, CB ); \ A0 = _mm_add_epi32( A0, CB ); \
A0B = _mm_add_epi32( A0B, CA ); \ AB = _mm_add_epi32( AB, CA ); \
A0A = _mm_add_epi32( A0A, C9 ); \ AA = _mm_add_epi32( AA, C9 ); \
A09 = _mm_add_epi32( A09, C8 ); \ A9 = _mm_add_epi32( A9, C8 ); \
A08 = _mm_add_epi32( A08, C7 ); \ A8 = _mm_add_epi32( A8, C7 ); \
A07 = _mm_add_epi32( A07, C6 ); \ A7 = _mm_add_epi32( A7, C6 ); \
A06 = _mm_add_epi32( A06, C5 ); \ A6 = _mm_add_epi32( A6, C5 ); \
A05 = _mm_add_epi32( A05, C4 ); \ A5 = _mm_add_epi32( A5, C4 ); \
A04 = _mm_add_epi32( A04, C3 ); \ A4 = _mm_add_epi32( A4, C3 ); \
A03 = _mm_add_epi32( A03, C2 ); \ A3 = _mm_add_epi32( A3, C2 ); \
A02 = _mm_add_epi32( A02, C1 ); \ A2 = _mm_add_epi32( A2, C1 ); \
A01 = _mm_add_epi32( A01, C0 ); \ A1 = _mm_add_epi32( A1, C0 ); \
A00 = _mm_add_epi32( A00, CF ); \ A0 = _mm_add_epi32( A0, CF ); \
A0B = _mm_add_epi32( A0B, CE ); \ AB = _mm_add_epi32( AB, CE ); \
A0A = _mm_add_epi32( A0A, CD ); \ AA = _mm_add_epi32( AA, CD ); \
A09 = _mm_add_epi32( A09, CC ); \ A9 = _mm_add_epi32( A9, CC ); \
A08 = _mm_add_epi32( A08, CB ); \ A8 = _mm_add_epi32( A8, CB ); \
A07 = _mm_add_epi32( A07, CA ); \ A7 = _mm_add_epi32( A7, CA ); \
A06 = _mm_add_epi32( A06, C9 ); \ A6 = _mm_add_epi32( A6, C9 ); \
A05 = _mm_add_epi32( A05, C8 ); \ A5 = _mm_add_epi32( A5, C8 ); \
A04 = _mm_add_epi32( A04, C7 ); \ A4 = _mm_add_epi32( A4, C7 ); \
A03 = _mm_add_epi32( A03, C6 ); \ A3 = _mm_add_epi32( A3, C6 ); \
A02 = _mm_add_epi32( A02, C5 ); \ A2 = _mm_add_epi32( A2, C5 ); \
A01 = _mm_add_epi32( A01, C4 ); \ A1 = _mm_add_epi32( A1, C4 ); \
A00 = _mm_add_epi32( A00, C3 ); \ A0 = _mm_add_epi32( A0, C3 ); \
} while (0) } while (0)
#define INCR_W do { \ #define INCR_W do { \
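The renamed A0..AB vector variables above carry, lane by lane, the same Shabal permutation element as the scalar reference code. For orientation, a scalar sketch of the element following the SPH reference implementation is shown below; the names rotl32 and perm_elt_scalar are illustrative only and not part of this source, and PERM_STEP_0/1/2 above run the same recurrence across four lanes at once with A, B, C and M held in __m128i registers.

#include <stdint.h>

// One Shabal permutation element per 32-bit lane, as in the SPH reference
// code (a sketch, not the miner's macro).
static inline uint32_t rotl32( uint32_t x, int n )
{
   return ( x << n ) | ( x >> ( 32 - n ) );
}

static inline void perm_elt_scalar( uint32_t *xa0, uint32_t xa1,
                                    uint32_t *xb0, uint32_t xb1,
                                    uint32_t xb2,  uint32_t xb3,
                                    uint32_t xc,   uint32_t xm )
{
   // A' = ( ( A0 ^ ( rotl(A1,15) * 5 ) ^ C ) * 3 ) ^ B1 ^ ( B2 & ~B3 ) ^ M
   *xa0 = ( ( *xa0 ^ ( rotl32( xa1, 15 ) * 5U ) ^ xc ) * 3U )
          ^ xb1 ^ ( xb2 & ~xb3 ) ^ xm;
   // B' = ~( rotl(B0,1) ^ A' )
   *xb0 = ~( rotl32( *xb0, 1 ) ^ *xa0 );
}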
View File
@@ -10,6 +10,7 @@
#include "algo-gate-api.h" #include "algo-gate-api.h"
#include "Verthash.h" #include "Verthash.h"
#include "mm_malloc.h" #include "mm_malloc.h"
#include "malloc-huge.h"
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
// Verthash info management // Verthash info management
@@ -84,12 +85,18 @@ int verthash_info_init(verthash_info_t* info, const char* file_name)
} }
// Allocate data // Allocate data
info->data = (uint8_t *)_mm_malloc( fileSize, 64 ); info->data = (uint8_t *)malloc_hugepages( fileSize );
if (!info->data) if ( info->data )
if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages");
else
{ {
fclose(fileMiningData); info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
// Memory allocation fatal error. if (!info->data)
return 2; {
fclose(fileMiningData);
// Memory allocation fatal error.
return 2;
}
} }
// Load data // Load data
View File
@@ -29,16 +29,11 @@ void sha3_4way_keccakf( __m256i st[25] )
for ( r = 0; r < KECCAKF_ROUNDS; r++ ) for ( r = 0; r < KECCAKF_ROUNDS; r++ )
{ {
// Theta // Theta
bc[0] = _mm256_xor_si256( st[0], bc[0] = mm256_xor3( st[0], st[5], mm256_xor3( st[10], st[15], st[20] ) );
mm256_xor4( st[5], st[10], st[15], st[20] ) ); bc[1] = mm256_xor3( st[1], st[6], mm256_xor3( st[11], st[16], st[21] ) );
bc[1] = _mm256_xor_si256( st[1], bc[2] = mm256_xor3( st[2], st[7], mm256_xor3( st[12], st[17], st[22] ) );
mm256_xor4( st[6], st[11], st[16], st[21] ) ); bc[3] = mm256_xor3( st[3], st[8], mm256_xor3( st[13], st[18], st[23] ) );
bc[2] = _mm256_xor_si256( st[2], bc[4] = mm256_xor3( st[4], st[9], mm256_xor3( st[14], st[19], st[24] ) );
mm256_xor4( st[7], st[12], st[17], st[22] ) );
bc[3] = _mm256_xor_si256( st[3],
mm256_xor4( st[8], st[13], st[18], st[23] ) );
bc[4] = _mm256_xor_si256( st[4],
mm256_xor4( st[9], st[14], st[19], st[24] ) );
for ( i = 0; i < 5; i++ ) for ( i = 0; i < 5; i++ )
{ {
@@ -89,17 +84,13 @@ void sha3_4way_keccakf( __m256i st[25] )
// Chi // Chi
for ( j = 0; j < 25; j += 5 ) for ( j = 0; j < 25; j += 5 )
{ {
memcpy( bc, &st[ j ], 5*32 ); bc[0] = st[j];
st[ j ] = _mm256_xor_si256( st[ j ], bc[1] = st[j+1];
_mm256_andnot_si256( bc[1], bc[2] ) ); st[ j ] = mm256_xorandnot( st[ j ], st[j+1], st[j+2] );
st[ j+1 ] = _mm256_xor_si256( st[ j+1 ], st[ j+1 ] = mm256_xorandnot( st[ j+1 ], st[j+2], st[j+3] );
_mm256_andnot_si256( bc[2], bc[3] ) ); st[ j+2 ] = mm256_xorandnot( st[ j+2 ], st[j+3], st[j+4] );
st[ j+2 ] = _mm256_xor_si256( st[ j+2 ], st[ j+3 ] = mm256_xorandnot( st[ j+3 ], st[j+4], bc[0] );
_mm256_andnot_si256( bc[3], bc[4] ) ); st[ j+4 ] = mm256_xorandnot( st[ j+4 ], bc[0], bc[1] );
st[ j+3 ] = _mm256_xor_si256( st[ j+3 ],
_mm256_andnot_si256( bc[4], bc[0] ) );
st[ j+4 ] = _mm256_xor_si256( st[ j+4 ],
_mm256_andnot_si256( bc[0], bc[1] ) );
} }
// Iota // Iota
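From the intrinsics they replace, the two helpers have straightforward semantics: mm256_xor3( a, b, c ) computes a ^ b ^ c, and mm256_xorandnot( a, b, c ) computes a ^ ( ~b & c ), which lets the Chi step drop the 5-word memcpy and keep only two saved lanes; where AVX-512VL is available such 3-input helpers can presumably compile to single ternary-logic instructions. A scalar per-lane sketch of the same Chi step is shown below for reference; the function name keccak_chi_lane is illustrative only.

#include <stdint.h>

// Scalar view of the Chi step above: a[i] ^= ~a[i+1] & a[i+2] within each row
// of five, with two saved copies (t0, t1) so the in-place update stays correct.
static void keccak_chi_lane( uint64_t st[25] )
{
   for ( int j = 0; j < 25; j += 5 )
   {
      const uint64_t t0 = st[j];     // plays the role of bc[0]
      const uint64_t t1 = st[j+1];   // plays the role of bc[1]
      st[j]   ^= ~st[j+1] & st[j+2];
      st[j+1] ^= ~st[j+2] & st[j+3];
      st[j+2] ^= ~st[j+3] & st[j+4];
      st[j+3] ^= ~st[j+4] & t0;
      st[j+4] ^= ~t0      & t1;
   }
}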
View File
@@ -127,7 +127,7 @@ bool register_verthash_algo( algo_gate_t* gate )
{ {
opt_target_factor = 256.0; opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash; gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = AVX2_OPT; gate->optimizations = SSE42_OPT | AVX2_OPT;
const char *verthash_data_file = opt_data_file ? opt_data_file const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file; : default_verthash_data_file;
configure vendored
View File
@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.2. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.3.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.19.2' PACKAGE_VERSION='3.19.3'
PACKAGE_STRING='cpuminer-opt 3.19.2' PACKAGE_STRING='cpuminer-opt 3.19.3'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.19.2 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.19.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.19.2:";; short | recursive ) echo "Configuration of cpuminer-opt 3.19.3:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.19.2 cpuminer-opt configure 3.19.3
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.19.2, which was It was created by cpuminer-opt $as_me 3.19.3, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.19.2' VERSION='3.19.3'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.19.2, which was This file was extended by cpuminer-opt $as_me 3.19.3, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.19.2 cpuminer-opt config.status 3.19.3
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"
View File
@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.19.2]) AC_INIT([cpuminer-opt], [3.19.3])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM
malloc-huge.c Normal file
View File
@@ -0,0 +1,36 @@
#include "malloc-huge.h"
#include "miner.h"
#define HUGEPAGE_SIZE_2M (2 * 1024 * 1024)
void *malloc_hugepages( size_t size )
{
#if !(defined(MAP_HUGETLB) && defined(MAP_ANON))
// applog( LOG_WARNING, "Huge pages not available" );
return NULL;
#else
if ( size < HUGEPAGE_MIN_ALLOC )
{
// applog( LOG_WARNING, "Block too small for huge pages: %lu bytes",size);
return NULL;
}
const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE_2M - 1;
void *p = NULL;
int flags =
#ifdef MAP_NOCORE
MAP_NOCORE |
#endif
MAP_HUGETLB | MAP_ANON | MAP_PRIVATE;
// round size up to the next 2 MiB huge page boundary
size = ( size + hugepage_mask ) & (~hugepage_mask);
p = mmap( NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0 );
if ( p == MAP_FAILED )
p = NULL;
return p;
#endif
}
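malloc_hugepages() is meant to be tried first, with an ordinary allocation as the fallback when it returns NULL, exactly as the Verthash change above does. A minimal stand-alone sketch of that pattern follows; alloc_big_buffer and its arguments are illustrative, and only malloc_hugepages() itself comes from malloc-huge.h. Note that a successful result is an mmap() mapping, so releasing it would take munmap() with the rounded-up size rather than free().

#include <stdbool.h>
#include <stddef.h>
#include <mm_malloc.h>
#include "malloc-huge.h"

// Try huge pages first, fall back to a 64-byte aligned heap allocation.
static void *alloc_big_buffer( size_t size, bool *is_huge )
{
   void *p = malloc_hugepages( size );   // NULL if huge pages are unavailable
   *is_huge = ( p != NULL );
   if ( !p )
      p = _mm_malloc( size, 64 );        // same fallback as verthash_info_init
   return p;
}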
malloc-huge.h Normal file
View File
@@ -0,0 +1,24 @@
#if !(defined(MALLOC_HUGE__))
#define MALLOC_HUGE__
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#ifdef __unix__
#include <sys/mman.h>
#endif
#if defined(MAP_HUGETLB)
// Minimum block size 6 MiB to use huge pages
#define HUGEPAGE_MIN_ALLOC (6 * 1024 * 1024)
#endif
// Attempt to allocate memory backed by 2 MiB pages, returns NULL on failure.
void *malloc_hugepages( size_t size );
#endif
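The allocation silently falls back when the kernel has no huge pages reserved (vm.nr_hugepages is 0 by default on most Linux distributions). A hypothetical helper, not part of cpuminer-opt, that reports the current reservation:

#include <stdio.h>

// Read /proc/sys/vm/nr_hugepages; returns -1 if it cannot be read
// (e.g. not running on Linux).
static long huge_pages_reserved( void )
{
   long n = -1;
   FILE *f = fopen( "/proc/sys/vm/nr_hugepages", "r" );
   if ( f )
   {
      if ( fscanf( f, "%ld", &n ) != 1 ) n = -1;
      fclose( f );
   }
   return n;
}

Reserving pages at run time is typically done with something like sysctl -w vm.nr_hugepages=N as root.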
View File
@@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
./clean-all.sh || echo clean ./clean-all.sh || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
@@ -53,7 +53,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
# AVX512 AES: Intel Core HEDT Skylake-X, Cascadelake # AVX512 AES: Intel Core HEDT Skylake-X, Cascadelake
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx512.exe mv cpuminer.exe release/cpuminer-avx512.exe
@@ -61,7 +61,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
# AVX2 SHA VAES: Intel Alderlake, AMD Zen3 # AVX2 SHA VAES: Intel Alderlake, AMD Zen3
make clean || echo done make clean || echo done
rm -f config.status rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -mavx2 -msha -mvaes" ./configure $CONFIGURE_ARGS CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
@@ -69,7 +69,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
# AVX2 AES SHA: AMD Zen1 # AVX2 AES SHA: AMD Zen1
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2-sha.exe mv cpuminer.exe release/cpuminer-avx2-sha.exe
@@ -77,7 +77,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe
# AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake # AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -march=core-avx2" ./configure $CONFIGURE_ARGS CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2.exe mv cpuminer.exe release/cpuminer-avx2.exe
@@ -85,7 +85,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe
# AVX AES: Intel Sandybridge, Ivybridge # AVX AES: Intel Sandybridge, Ivybridge
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="$DEFAULT_CFLAGS_OLD -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS CFLAGS="-march=corei7-avx -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx.exe mv cpuminer.exe release/cpuminer-avx.exe
@@ -93,7 +93,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
# SSE4.2 AES: Intel Westmere # SSE4.2 AES: Intel Westmere
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="$DEFAULT_CFLAGS_OLD -march=westmere -maes" ./configure $CONFIGURE_ARGS CFLAGS="-march=westmere -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-sse42.exe mv cpuminer.exe release/cpuminer-aes-sse42.exe
@@ -118,9 +118,16 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
# Generic SSE2 # Generic SSE2
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="$DEFAULT_CFLAGS_OLD -msse2" ./configure $CONFIGURE_ARGS CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean make clean || echo clean
# Native with CPU groups enabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe