mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.19.3
This commit is contained in:
@@ -21,6 +21,7 @@ cpuminer_SOURCES = \
|
||||
api.c \
|
||||
sysinfos.c \
|
||||
algo-gate-api.c\
|
||||
malloc-huge.c \
|
||||
algo/argon2/argon2a/argon2a.c \
|
||||
algo/argon2/argon2a/ar2/argon2.c \
|
||||
algo/argon2/argon2a/ar2/opt.c \
|
||||
|
@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.19.3
|
||||
|
||||
Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available.
|
||||
|
||||
Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512.
|
||||
|
||||
v3.19.2
|
||||
|
||||
Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1.
|
||||
|
@@ -37,6 +37,13 @@
|
||||
|
||||
#if defined(__AVX512F__)
|
||||
|
||||
static inline __m512i blamka( __m512i x, __m512i y )
|
||||
{
|
||||
__m512i xy = _mm512_mul_epu32( x, y );
|
||||
return _mm512_add_epi64( _mm512_add_epi64( x, y ),
|
||||
_mm512_add_epi64( xy, xy ) );
|
||||
}
|
||||
|
||||
static void fill_block( __m512i *state, const block *ref_block,
|
||||
block *next_block, int with_xor )
|
||||
{
|
||||
|
@@ -328,9 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define ROR64(x, n) _mm512_ror_epi64((x), (n))
|
||||
|
||||
static __m512i muladd(__m512i x, __m512i y)
|
||||
static inline __m512i muladd(__m512i x, __m512i y)
|
||||
{
|
||||
__m512i z = _mm512_mul_epu32(x, y);
|
||||
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
|
||||
@@ -344,8 +342,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
D0 = _mm512_xor_si512(D0, A0); \
|
||||
D1 = _mm512_xor_si512(D1, A1); \
|
||||
\
|
||||
D0 = ROR64(D0, 32); \
|
||||
D1 = ROR64(D1, 32); \
|
||||
D0 = _mm512_ror_epi64(D0, 32); \
|
||||
D1 = _mm512_ror_epi64(D1, 32); \
|
||||
\
|
||||
C0 = muladd(C0, D0); \
|
||||
C1 = muladd(C1, D1); \
|
||||
@@ -353,8 +351,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
B0 = _mm512_xor_si512(B0, C0); \
|
||||
B1 = _mm512_xor_si512(B1, C1); \
|
||||
\
|
||||
B0 = ROR64(B0, 24); \
|
||||
B1 = ROR64(B1, 24); \
|
||||
B0 = _mm512_ror_epi64(B0, 24); \
|
||||
B1 = _mm512_ror_epi64(B1, 24); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -365,8 +363,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
D0 = _mm512_xor_si512(D0, A0); \
|
||||
D1 = _mm512_xor_si512(D1, A1); \
|
||||
\
|
||||
D0 = ROR64(D0, 16); \
|
||||
D1 = ROR64(D1, 16); \
|
||||
D0 = _mm512_ror_epi64(D0, 16); \
|
||||
D1 = _mm512_ror_epi64(D1, 16); \
|
||||
\
|
||||
C0 = muladd(C0, D0); \
|
||||
C1 = muladd(C1, D1); \
|
||||
@@ -374,8 +372,8 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
B0 = _mm512_xor_si512(B0, C0); \
|
||||
B1 = _mm512_xor_si512(B1, C1); \
|
||||
\
|
||||
B0 = ROR64(B0, 63); \
|
||||
B1 = ROR64(B1, 63); \
|
||||
B0 = _mm512_ror_epi64(B0, 63); \
|
||||
B1 = _mm512_ror_epi64(B1, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -417,11 +415,10 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
|
||||
#define SWAP_HALVES(A0, A1) \
|
||||
do { \
|
||||
__m512i t0, t1; \
|
||||
t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
|
||||
t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
|
||||
A0 = t0; \
|
||||
A1 = t1; \
|
||||
__m512i t; \
|
||||
t = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
|
||||
A1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
|
||||
A0 = t; \
|
||||
} while((void)0, 0)
|
||||
|
||||
#define SWAP_QUARTERS(A0, A1) \
|
||||
|
@@ -545,21 +545,23 @@ static const sph_u32 T512[64][16] = {
|
||||
#define sE c7
|
||||
#define sF m7
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Hamsi 8 way AVX512
|
||||
|
||||
// Tested on i9-9940x movepi64_mask is slow, cmple_epi64_mask with zero
|
||||
// produces the same result but is faster.
|
||||
// Intel says _mm512_movepi64_mask has (1L/1T) timimg while
|
||||
// _mm512_cmplt_epi64_mask as (3L/1T) timing, however, when tested hashing X13
|
||||
// on i9-9940x cmplt with zero was 3% faster than movepi.
|
||||
|
||||
#define INPUT_BIG8 \
|
||||
do { \
|
||||
__m512i db = _mm512_ror_epi64( *buf, 1 ); \
|
||||
const __m512i zero = m512_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
for ( int u = 0; u < 64; u++ ) \
|
||||
{ \
|
||||
__mmask8 dm = _mm512_cmplt_epi64_mask( db, m512_zero ); \
|
||||
const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
|
||||
m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
|
||||
m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
|
||||
m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
|
||||
@@ -573,29 +575,6 @@ do { \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define INPUT_BIG8 \
|
||||
do { \
|
||||
__m512i db = *buf; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
|
||||
for ( int u = 0; u < 64; u++ ) \
|
||||
{ \
|
||||
__m512i dm = mm512_negate_64( _mm512_and_si512( db, m512_one_64 ) ); \
|
||||
m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
|
||||
m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
|
||||
m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
|
||||
m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
|
||||
m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
|
||||
m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
|
||||
m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
|
||||
m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
|
||||
tp += 8; \
|
||||
db = _mm512_srli_epi64( db, 1 ); \
|
||||
} \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#define SBOX8( a, b, c, d ) \
|
||||
do { \
|
||||
__m512i t; \
|
||||
@@ -632,199 +611,192 @@ do { \
|
||||
|
||||
#define READ_STATE_BIG8(sc) \
|
||||
do { \
|
||||
c0 = sc->h[0x0]; \
|
||||
c1 = sc->h[0x1]; \
|
||||
c2 = sc->h[0x2]; \
|
||||
c3 = sc->h[0x3]; \
|
||||
c4 = sc->h[0x4]; \
|
||||
c5 = sc->h[0x5]; \
|
||||
c6 = sc->h[0x6]; \
|
||||
c7 = sc->h[0x7]; \
|
||||
c0 = sc->h[0]; \
|
||||
c1 = sc->h[1]; \
|
||||
c2 = sc->h[2]; \
|
||||
c3 = sc->h[3]; \
|
||||
c4 = sc->h[4]; \
|
||||
c5 = sc->h[5]; \
|
||||
c6 = sc->h[6]; \
|
||||
c7 = sc->h[7]; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE_BIG8(sc) \
|
||||
do { \
|
||||
sc->h[0x0] = c0; \
|
||||
sc->h[0x1] = c1; \
|
||||
sc->h[0x2] = c2; \
|
||||
sc->h[0x3] = c3; \
|
||||
sc->h[0x4] = c4; \
|
||||
sc->h[0x5] = c5; \
|
||||
sc->h[0x6] = c6; \
|
||||
sc->h[0x7] = c7; \
|
||||
sc->h[0] = c0; \
|
||||
sc->h[1] = c1; \
|
||||
sc->h[2] = c2; \
|
||||
sc->h[3] = c3; \
|
||||
sc->h[4] = c4; \
|
||||
sc->h[5] = c5; \
|
||||
sc->h[6] = c6; \
|
||||
sc->h[7] = c7; \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define ROUND_BIG8( alpha ) \
|
||||
do { \
|
||||
__m512i t0, t1, t2, t3; \
|
||||
s0 = _mm512_xor_si512( s0, alpha[ 0] ); \
|
||||
s1 = _mm512_xor_si512( s1, alpha[ 1] ); \
|
||||
s2 = _mm512_xor_si512( s2, alpha[ 2] ); \
|
||||
s3 = _mm512_xor_si512( s3, alpha[ 3] ); \
|
||||
s4 = _mm512_xor_si512( s4, alpha[ 4] ); \
|
||||
s5 = _mm512_xor_si512( s5, alpha[ 5] ); \
|
||||
s6 = _mm512_xor_si512( s6, alpha[ 6] ); \
|
||||
s7 = _mm512_xor_si512( s7, alpha[ 7] ); \
|
||||
s8 = _mm512_xor_si512( s8, alpha[ 8] ); \
|
||||
s9 = _mm512_xor_si512( s9, alpha[ 9] ); \
|
||||
sA = _mm512_xor_si512( sA, alpha[10] ); \
|
||||
sB = _mm512_xor_si512( sB, alpha[11] ); \
|
||||
sC = _mm512_xor_si512( sC, alpha[12] ); \
|
||||
sD = _mm512_xor_si512( sD, alpha[13] ); \
|
||||
sE = _mm512_xor_si512( sE, alpha[14] ); \
|
||||
sF = _mm512_xor_si512( sF, alpha[15] ); \
|
||||
s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
|
||||
s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
|
||||
s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
|
||||
s3 = _mm512_xor_si512( s3, alpha[ 3] ); /* c1 */ \
|
||||
s4 = _mm512_xor_si512( s4, alpha[ 4] ); /* c2 */ \
|
||||
s5 = _mm512_xor_si512( s5, alpha[ 5] ); /* m2 */ \
|
||||
s6 = _mm512_xor_si512( s6, alpha[ 6] ); /* c3 */ \
|
||||
s7 = _mm512_xor_si512( s7, alpha[ 7] ); /* m3 */ \
|
||||
s8 = _mm512_xor_si512( s8, alpha[ 8] ); /* m4 */ \
|
||||
s9 = _mm512_xor_si512( s9, alpha[ 9] ); /* c4 */ \
|
||||
sA = _mm512_xor_si512( sA, alpha[10] ); /* m5 */ \
|
||||
sB = _mm512_xor_si512( sB, alpha[11] ); /* c5 */ \
|
||||
sC = _mm512_xor_si512( sC, alpha[12] ); /* c6 */ \
|
||||
sD = _mm512_xor_si512( sD, alpha[13] ); /* m6 */ \
|
||||
sE = _mm512_xor_si512( sE, alpha[14] ); /* c7 */ \
|
||||
sF = _mm512_xor_si512( sF, alpha[15] ); /* m7 */ \
|
||||
\
|
||||
SBOX8( s0, s4, s8, sC ); \
|
||||
SBOX8( s1, s5, s9, sD ); \
|
||||
SBOX8( s2, s6, sA, sE ); \
|
||||
SBOX8( s3, s7, sB, sF ); \
|
||||
SBOX8( s0, s4, s8, sC ); /* ( m0, c2, m4, c6 ) */ \
|
||||
SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \
|
||||
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
|
||||
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
|
||||
_mm512_bslli_epi128( s5, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
|
||||
_mm512_bslli_epi128( sE, 4 ) ); \
|
||||
s4 = mm512_swap64_32( s4 ); \
|
||||
s5 = mm512_swap64_32( s5 ); \
|
||||
sD = mm512_swap64_32( sD ); \
|
||||
sE = mm512_swap64_32( sE ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
|
||||
L8( s0, t1, s9, t3 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \
|
||||
sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \
|
||||
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
|
||||
_mm512_bslli_epi128( s6, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
|
||||
_mm512_bslli_epi128( sF, 4 ) ); \
|
||||
s6 = mm512_swap64_32( s6 ); \
|
||||
sF = mm512_swap64_32( sF ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
|
||||
L8( s1, t1, sA, t3 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \
|
||||
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
|
||||
_mm512_bslli_epi128( s7, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
|
||||
_mm512_bslli_epi128( sC, 4 ) ); \
|
||||
s7 = mm512_swap64_32( s7 ); \
|
||||
sC = mm512_swap64_32( sC ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
|
||||
L8( s2, t1, sB, t3 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \
|
||||
sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \
|
||||
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \
|
||||
s6 = mm512_swap64_32( s6 ); \
|
||||
sF = mm512_swap64_32( sF ); \
|
||||
\
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
|
||||
_mm512_bslli_epi128( s4, 4 ) ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
|
||||
_mm512_bslli_epi128( sD, 4 ) ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
|
||||
L8( s3, t1, s8, t3 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \
|
||||
sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \
|
||||
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \
|
||||
s7 = mm512_swap64_32( s7 ); \
|
||||
sC = mm512_swap64_32( sC ); \
|
||||
\
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
|
||||
_mm512_bslli_epi128( sB, 4 ) ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
|
||||
t3 = mm512_swap64_32( t3 ); \
|
||||
L8( t0, t1, t2, t3 ); \
|
||||
t3 = mm512_swap64_32( t3 ); \
|
||||
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
|
||||
s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
|
||||
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
|
||||
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
|
||||
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
|
||||
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
|
||||
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
|
||||
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
|
||||
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
|
||||
sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
|
||||
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
|
||||
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
|
||||
\
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
|
||||
_mm512_bslli_epi128( sD, 4 ) ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
|
||||
L8( t0, t1, t2, t3 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \
|
||||
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
|
||||
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
|
||||
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
|
||||
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
|
||||
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
|
||||
s4 = mm512_swap64_32( s4 ); \
|
||||
s5 = mm512_swap64_32( s5 ); \
|
||||
sD = mm512_swap64_32( sD ); \
|
||||
sE = mm512_swap64_32( sE ); \
|
||||
} while (0)
|
||||
|
||||
#define P_BIG8 \
|
||||
do { \
|
||||
__m512i alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define PF_BIG8 \
|
||||
do { \
|
||||
__m512i alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG8( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define T_BIG8 \
|
||||
do { /* order is important */ \
|
||||
c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
|
||||
c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
|
||||
c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
|
||||
c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
|
||||
c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
|
||||
c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
|
||||
c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
|
||||
c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
|
||||
c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB ); /* c5 */ \
|
||||
c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA ); /* m5 */ \
|
||||
c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 ); /* c4 */ \
|
||||
c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 ); /* m4 */ \
|
||||
c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 ); /* c1 */ \
|
||||
c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 ); /* m1 */ \
|
||||
c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 ); /* c0 */ \
|
||||
c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \
|
||||
} while (0)
|
||||
|
||||
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
|
||||
@@ -861,7 +833,6 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
|
||||
WRITE_STATE_BIG8( sc );
|
||||
}
|
||||
|
||||
|
||||
void hamsi512_8way_init( hamsi_8way_big_context *sc )
|
||||
{
|
||||
sc->partial_len = 0;
|
||||
@@ -911,11 +882,12 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
||||
#define INPUT_BIG \
|
||||
do { \
|
||||
__m256i db = *buf; \
|
||||
const __m256i zero = m256_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
|
||||
for ( int u = 0; u < 64; u++ ) \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
for ( int u = 63; u >= 0; u-- ) \
|
||||
{ \
|
||||
__m256i dm = mm256_negate_64( _mm256_and_si256( db, m256_one_64 ) ); \
|
||||
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
|
||||
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
|
||||
m256_const1_64( tp[0] ) ) ); \
|
||||
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
|
||||
@@ -933,7 +905,6 @@ do { \
|
||||
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
|
||||
m256_const1_64( tp[7] ) ) ); \
|
||||
tp += 8; \
|
||||
db = _mm256_srli_epi64( db, 1 ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -982,47 +953,28 @@ do { \
|
||||
|
||||
#define READ_STATE_BIG(sc) \
|
||||
do { \
|
||||
c0 = sc->h[0x0]; \
|
||||
c1 = sc->h[0x1]; \
|
||||
c2 = sc->h[0x2]; \
|
||||
c3 = sc->h[0x3]; \
|
||||
c4 = sc->h[0x4]; \
|
||||
c5 = sc->h[0x5]; \
|
||||
c6 = sc->h[0x6]; \
|
||||
c7 = sc->h[0x7]; \
|
||||
c0 = sc->h[0]; \
|
||||
c1 = sc->h[1]; \
|
||||
c2 = sc->h[2]; \
|
||||
c3 = sc->h[3]; \
|
||||
c4 = sc->h[4]; \
|
||||
c5 = sc->h[5]; \
|
||||
c6 = sc->h[6]; \
|
||||
c7 = sc->h[7]; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE_BIG(sc) \
|
||||
do { \
|
||||
sc->h[0x0] = c0; \
|
||||
sc->h[0x1] = c1; \
|
||||
sc->h[0x2] = c2; \
|
||||
sc->h[0x3] = c3; \
|
||||
sc->h[0x4] = c4; \
|
||||
sc->h[0x5] = c5; \
|
||||
sc->h[0x6] = c6; \
|
||||
sc->h[0x7] = c7; \
|
||||
sc->h[0] = c0; \
|
||||
sc->h[1] = c1; \
|
||||
sc->h[2] = c2; \
|
||||
sc->h[3] = c3; \
|
||||
sc->h[4] = c4; \
|
||||
sc->h[5] = c5; \
|
||||
sc->h[6] = c6; \
|
||||
sc->h[7] = c7; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define s0 m0
|
||||
#define s1 c0
|
||||
#define s2 m1
|
||||
#define s3 c1
|
||||
#define s4 c2
|
||||
#define s5 m2
|
||||
#define s6 c3
|
||||
#define s7 m3
|
||||
#define s8 m4
|
||||
#define s9 c4
|
||||
#define sA m5
|
||||
#define sB c5
|
||||
#define sC c6
|
||||
#define sD m6
|
||||
#define sE c7
|
||||
#define sF m7
|
||||
*/
|
||||
|
||||
#define ROUND_BIG( alpha ) \
|
||||
do { \
|
||||
__m256i t0, t1, t2, t3; \
|
||||
@@ -1048,151 +1000,145 @@ do { \
|
||||
SBOX( s2, s6, sA, sE ); \
|
||||
SBOX( s3, s7, sB, sF ); \
|
||||
\
|
||||
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \
|
||||
_mm256_bslli_epi128( s5, 4 ), 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \
|
||||
_mm256_bslli_epi128( sE, 4 ), 0xAA ); \
|
||||
s4 = mm256_swap64_32( s4 ); \
|
||||
s5 = mm256_swap64_32( s5 ); \
|
||||
sD = mm256_swap64_32( sD ); \
|
||||
sE = mm256_swap64_32( sE ); \
|
||||
t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \
|
||||
L( s0, t1, s9, t3 ); \
|
||||
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
|
||||
s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
|
||||
sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
|
||||
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
|
||||
s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \
|
||||
s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \
|
||||
sD = _mm256_blend_epi32( sD, t3, 0x55 ); \
|
||||
sE = _mm256_blend_epi32( sE, t3, 0xaa ); \
|
||||
\
|
||||
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
|
||||
_mm256_bslli_epi128( s6, 4 ), 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \
|
||||
_mm256_bslli_epi128( sF, 4 ), 0xAA ); \
|
||||
s6 = mm256_swap64_32( s6 ); \
|
||||
sF = mm256_swap64_32( sF ); \
|
||||
t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
|
||||
L( s1, t1, sA, t3 ); \
|
||||
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
|
||||
s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
|
||||
sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
|
||||
sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
|
||||
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
|
||||
s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \
|
||||
sE = _mm256_blend_epi32( sE, t3, 0x55 ); \
|
||||
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
|
||||
\
|
||||
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \
|
||||
_mm256_bslli_epi128( s7, 4 ), 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \
|
||||
_mm256_bslli_epi128( sC, 4 ), 0xAA ); \
|
||||
s7 = mm256_swap64_32( s7 ); \
|
||||
sC = mm256_swap64_32( sC ); \
|
||||
t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \
|
||||
L( s2, t1, sB, t3 ); \
|
||||
s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
|
||||
s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
|
||||
sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
|
||||
sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
|
||||
s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \
|
||||
s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \
|
||||
sF = _mm256_blend_epi32( sF, t3, 0x55 ); \
|
||||
sC = _mm256_blend_epi32( sC, t3, 0xaa ); \
|
||||
s6 = mm256_swap64_32( s6 ); \
|
||||
sF = mm256_swap64_32( sF ); \
|
||||
\
|
||||
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \
|
||||
_mm256_bslli_epi128( s4, 4 ), 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
|
||||
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
|
||||
t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
|
||||
L( s3, t1, s8, t3 ); \
|
||||
s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
|
||||
s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
|
||||
sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
|
||||
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
|
||||
s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \
|
||||
s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \
|
||||
sC = _mm256_blend_epi32( sC, t3, 0x55 ); \
|
||||
sD = _mm256_blend_epi32( sD, t3, 0xaa ); \
|
||||
s7 = mm256_swap64_32( s7 ); \
|
||||
sC = mm256_swap64_32( sC ); \
|
||||
\
|
||||
t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \
|
||||
t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \
|
||||
t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \
|
||||
_mm256_bslli_epi128( sB, 4 ), 0xAA ); \
|
||||
t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \
|
||||
t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \
|
||||
t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \
|
||||
t3 = mm256_swap64_32( t3 ); \
|
||||
L( t0, t1, t2, t3 ); \
|
||||
t3 = mm256_swap64_32( t3 ); \
|
||||
s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
|
||||
s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \
|
||||
s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \
|
||||
s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
|
||||
s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \
|
||||
s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \
|
||||
sA = _mm256_blend_epi32( sA, t2, 0xAA ); \
|
||||
s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \
|
||||
sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \
|
||||
s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \
|
||||
s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \
|
||||
sA = _mm256_blend_epi32( sA, t2, 0xaa ); \
|
||||
s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \
|
||||
sB = _mm256_blend_epi32( sB, t3, 0x55 ); \
|
||||
\
|
||||
t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \
|
||||
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
|
||||
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
|
||||
t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
|
||||
t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
|
||||
t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \
|
||||
t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \
|
||||
t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \
|
||||
t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \
|
||||
L( t0, t1, t2, t3 ); \
|
||||
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \
|
||||
sC = _mm256_blend_epi32( sC, t0, 0xAA ); \
|
||||
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \
|
||||
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \
|
||||
s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \
|
||||
sC = _mm256_blend_epi32( sC, t0, 0xaa ); \
|
||||
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
|
||||
sD = _mm256_blend_epi32( sD, t1, 0xaa ); \
|
||||
s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
|
||||
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \
|
||||
sE = _mm256_blend_epi32( sE, t2, 0xaa ); \
|
||||
s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
|
||||
sF = _mm256_blend_epi32( sF, t3, 0xAA ); \
|
||||
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
|
||||
s4 = mm256_swap64_32( s4 ); \
|
||||
s5 = mm256_swap64_32( s5 ); \
|
||||
sD = mm256_swap64_32( sD ); \
|
||||
sE = mm256_swap64_32( sE ); \
|
||||
} while (0)
|
||||
|
||||
#define P_BIG \
|
||||
do { \
|
||||
__m256i alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_n )[0] ); \
|
||||
alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define PF_BIG \
|
||||
do { \
|
||||
__m256i alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \
|
||||
^ ( (uint64_t*)alpha_f )[0] ); \
|
||||
alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \
|
||||
ROUND_BIG( alpha ); \
|
||||
} while (0)
|
||||
|
||||
#define T_BIG \
|
||||
do { /* order is important */ \
|
||||
c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \
|
||||
c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \
|
||||
c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \
|
||||
c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \
|
||||
c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \
|
||||
c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \
|
||||
c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \
|
||||
c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \
|
||||
c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \
|
||||
c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \
|
||||
c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \
|
||||
c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \
|
||||
c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \
|
||||
c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \
|
||||
c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \
|
||||
c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \
|
||||
} while (0)
|
||||
|
||||
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
|
||||
|
@@ -53,7 +53,8 @@ static const uint64_t RC[] = {
|
||||
#define WRITE_STATE(sc)
|
||||
|
||||
#define MOV64(d, s) (d = s)
|
||||
#define XOR64_IOTA XOR64
|
||||
#define XOR64_IOTA XOR
|
||||
|
||||
|
||||
#define LPAR (
|
||||
#define RPAR )
|
||||
@@ -71,14 +72,15 @@ static const uint64_t RC[] = {
|
||||
// Targetted macros, keccak-macros.h is included for each target.
|
||||
|
||||
#define DECL64(x) __m512i x
|
||||
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define XOR64 XOR
|
||||
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
|
||||
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
|
||||
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
|
||||
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
|
||||
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
|
||||
|
||||
#define XOR3( d, a, b, c ) (d = mm512_xor3( a, b, c ))
|
||||
|
||||
#include "keccak-macros.c"
|
||||
|
||||
@@ -236,6 +238,7 @@ keccak512_8way_close(void *cc, void *dst)
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
#undef XOR
|
||||
#undef AND64
|
||||
#undef OR64
|
||||
#undef NOT64
|
||||
@@ -243,7 +246,7 @@ keccak512_8way_close(void *cc, void *dst)
|
||||
#undef KECCAK_F_1600
|
||||
#undef XOROR
|
||||
#undef XORAND
|
||||
|
||||
#undef XOR3
|
||||
#endif // AVX512
|
||||
|
||||
// AVX2
|
||||
@@ -255,13 +258,15 @@ keccak512_8way_close(void *cc, void *dst)
|
||||
} while (0)
|
||||
|
||||
#define DECL64(x) __m256i x
|
||||
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
|
||||
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
|
||||
#define XOR64 XOR
|
||||
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
|
||||
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
|
||||
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
|
||||
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
|
||||
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
|
||||
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))
|
||||
|
||||
#include "keccak-macros.c"
|
||||
|
||||
@@ -421,6 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
#undef XOR
|
||||
#undef AND64
|
||||
#undef OR64
|
||||
#undef NOT64
|
||||
@@ -428,5 +434,6 @@ keccak512_4way_close(void *cc, void *dst)
|
||||
#undef KECCAK_F_1600
|
||||
#undef XOROR
|
||||
#undef XORAND
|
||||
#undef XOR3
|
||||
|
||||
#endif // AVX2
|
||||
|
@@ -1,6 +1,19 @@
|
||||
#ifdef TH_ELT
|
||||
#undef TH_ELT
|
||||
#endif
|
||||
|
||||
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
|
||||
DECL64(tt0); \
|
||||
DECL64(tt1); \
|
||||
XOR3( tt0, d0, d1, d4 ); \
|
||||
XOR( tt1, d2, d3 ); \
|
||||
XOR( tt0, tt0, tt1 ); \
|
||||
ROL64( tt0, tt0, 1 ); \
|
||||
XOR3( tt1, c0, c1, c4 ); \
|
||||
XOR3( tt0, tt0, c2, c3 ); \
|
||||
XOR( t, tt0, tt1 ); \
|
||||
} while (0)
|
||||
/*
|
||||
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
|
||||
DECL64(tt0); \
|
||||
DECL64(tt1); \
|
||||
@@ -17,7 +30,7 @@
|
||||
XOR64(tt2, tt2, tt3); \
|
||||
XOR64(t, tt0, tt2); \
|
||||
} while (0)
|
||||
|
||||
*/
|
||||
#ifdef THETA
|
||||
#undef THETA
|
||||
#endif
|
||||
|
@@ -34,6 +34,7 @@
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include <mm_malloc.h>
|
||||
#include "malloc-huge.h"
|
||||
|
||||
static const uint32_t keypad[12] = {
|
||||
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
|
||||
@@ -1487,11 +1488,19 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
|
||||
|
||||
bool scrypt_miner_thread_init( int thr_id )
|
||||
{
|
||||
scratchbuf = _mm_malloc( scratchbuf_size, 128 );
|
||||
scratchbuf = malloc_hugepages( scratchbuf_size );
|
||||
if ( scratchbuf )
|
||||
return true;
|
||||
{
|
||||
if ( opt_debug )
|
||||
applog( LOG_NOTICE, "Thread %u is using huge pages", thr_id );
|
||||
}
|
||||
else
|
||||
scratchbuf = _mm_malloc( scratchbuf_size, 128 );
|
||||
|
||||
if ( scratchbuf ) return true;
|
||||
|
||||
applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id );
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
bool register_scrypt_algo( algo_gate_t* gate )
|
||||
|
@@ -62,8 +62,8 @@ extern "C"{
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define DECL_STATE8 \
|
||||
__m256i A00, A01, A02, A03, A04, A05, A06, A07, \
|
||||
A08, A09, A0A, A0B; \
|
||||
__m256i A0, A1, A2, A3, A4, A5, A6, A7, \
|
||||
A8, A9, AA, AB; \
|
||||
__m256i B0, B1, B2, B3, B4, B5, B6, B7, \
|
||||
B8, B9, BA, BB, BC, BD, BE, BF; \
|
||||
__m256i C0, C1, C2, C3, C4, C5, C6, C7, \
|
||||
@@ -78,18 +78,18 @@ extern "C"{
|
||||
{ \
|
||||
if ( (state)->state_loaded ) \
|
||||
{ \
|
||||
A00 = (state)->A[0]; \
|
||||
A01 = (state)->A[1]; \
|
||||
A02 = (state)->A[2]; \
|
||||
A03 = (state)->A[3]; \
|
||||
A04 = (state)->A[4]; \
|
||||
A05 = (state)->A[5]; \
|
||||
A06 = (state)->A[6]; \
|
||||
A07 = (state)->A[7]; \
|
||||
A08 = (state)->A[8]; \
|
||||
A09 = (state)->A[9]; \
|
||||
A0A = (state)->A[10]; \
|
||||
A0B = (state)->A[11]; \
|
||||
A0 = (state)->A[0]; \
|
||||
A1 = (state)->A[1]; \
|
||||
A2 = (state)->A[2]; \
|
||||
A3 = (state)->A[3]; \
|
||||
A4 = (state)->A[4]; \
|
||||
A5 = (state)->A[5]; \
|
||||
A6 = (state)->A[6]; \
|
||||
A7 = (state)->A[7]; \
|
||||
A8 = (state)->A[8]; \
|
||||
A9 = (state)->A[9]; \
|
||||
AA = (state)->A[10]; \
|
||||
AB = (state)->A[11]; \
|
||||
B0 = (state)->B[0]; \
|
||||
B1 = (state)->B[1]; \
|
||||
B2 = (state)->B[2]; \
|
||||
@@ -126,18 +126,18 @@ extern "C"{
|
||||
else \
|
||||
{ \
|
||||
(state)->state_loaded = true; \
|
||||
A00 = m256_const1_64( 0x20728DFD20728DFD ); \
|
||||
A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
|
||||
A02 = m256_const1_64( 0xE782B699E782B699 ); \
|
||||
A03 = m256_const1_64( 0x5530463255304632 ); \
|
||||
A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
|
||||
A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
|
||||
A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
|
||||
A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
|
||||
A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
|
||||
A09 = m256_const1_64( 0x8BD144108BD14410 ); \
|
||||
A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
|
||||
A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
|
||||
A0 = m256_const1_64( 0x20728DFD20728DFD ); \
|
||||
A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
|
||||
A2 = m256_const1_64( 0xE782B699E782B699 ); \
|
||||
A3 = m256_const1_64( 0x5530463255304632 ); \
|
||||
A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
|
||||
A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
|
||||
A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
|
||||
A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
|
||||
A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
|
||||
A9 = m256_const1_64( 0x8BD144108BD14410 ); \
|
||||
AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
|
||||
AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
|
||||
B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
|
||||
B1 = m256_const1_64( 0x07B385F307B385F3 ); \
|
||||
B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
|
||||
@@ -176,18 +176,18 @@ extern "C"{
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE8(state) do { \
|
||||
(state)->A[0] = A00; \
|
||||
(state)->A[1] = A01; \
|
||||
(state)->A[2] = A02; \
|
||||
(state)->A[3] = A03; \
|
||||
(state)->A[4] = A04; \
|
||||
(state)->A[5] = A05; \
|
||||
(state)->A[6] = A06; \
|
||||
(state)->A[7] = A07; \
|
||||
(state)->A[8] = A08; \
|
||||
(state)->A[9] = A09; \
|
||||
(state)->A[10] = A0A; \
|
||||
(state)->A[11] = A0B; \
|
||||
(state)->A[0] = A0; \
|
||||
(state)->A[1] = A1; \
|
||||
(state)->A[2] = A2; \
|
||||
(state)->A[3] = A3; \
|
||||
(state)->A[4] = A4; \
|
||||
(state)->A[5] = A5; \
|
||||
(state)->A[6] = A6; \
|
||||
(state)->A[7] = A7; \
|
||||
(state)->A[8] = A8; \
|
||||
(state)->A[9] = A9; \
|
||||
(state)->A[10] = AA; \
|
||||
(state)->A[11] = AB; \
|
||||
(state)->B[0] = B0; \
|
||||
(state)->B[1] = B1; \
|
||||
(state)->B[2] = B2; \
|
||||
@@ -286,8 +286,8 @@ do { \
|
||||
|
||||
#define XOR_W8 \
|
||||
do { \
|
||||
A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \
|
||||
A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \
|
||||
A0 = _mm256_xor_si256( A0, _mm256_set1_epi32( Wlow ) ); \
|
||||
A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \
|
||||
} while (0)
|
||||
|
||||
#define SWAP_BC8 \
|
||||
@@ -321,60 +321,60 @@ do { \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_0_8 do { \
|
||||
PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
|
||||
PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_1_8 do { \
|
||||
PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
|
||||
PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_2_8 do { \
|
||||
PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
|
||||
PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define APPLY_P8 \
|
||||
@@ -398,42 +398,42 @@ do { \
|
||||
PERM_STEP_0_8; \
|
||||
PERM_STEP_1_8; \
|
||||
PERM_STEP_2_8; \
|
||||
A0B = _mm256_add_epi32( A0B, C6 ); \
|
||||
A0A = _mm256_add_epi32( A0A, C5 ); \
|
||||
A09 = _mm256_add_epi32( A09, C4 ); \
|
||||
A08 = _mm256_add_epi32( A08, C3 ); \
|
||||
A07 = _mm256_add_epi32( A07, C2 ); \
|
||||
A06 = _mm256_add_epi32( A06, C1 ); \
|
||||
A05 = _mm256_add_epi32( A05, C0 ); \
|
||||
A04 = _mm256_add_epi32( A04, CF ); \
|
||||
A03 = _mm256_add_epi32( A03, CE ); \
|
||||
A02 = _mm256_add_epi32( A02, CD ); \
|
||||
A01 = _mm256_add_epi32( A01, CC ); \
|
||||
A00 = _mm256_add_epi32( A00, CB ); \
|
||||
A0B = _mm256_add_epi32( A0B, CA ); \
|
||||
A0A = _mm256_add_epi32( A0A, C9 ); \
|
||||
A09 = _mm256_add_epi32( A09, C8 ); \
|
||||
A08 = _mm256_add_epi32( A08, C7 ); \
|
||||
A07 = _mm256_add_epi32( A07, C6 ); \
|
||||
A06 = _mm256_add_epi32( A06, C5 ); \
|
||||
A05 = _mm256_add_epi32( A05, C4 ); \
|
||||
A04 = _mm256_add_epi32( A04, C3 ); \
|
||||
A03 = _mm256_add_epi32( A03, C2 ); \
|
||||
A02 = _mm256_add_epi32( A02, C1 ); \
|
||||
A01 = _mm256_add_epi32( A01, C0 ); \
|
||||
A00 = _mm256_add_epi32( A00, CF ); \
|
||||
A0B = _mm256_add_epi32( A0B, CE ); \
|
||||
A0A = _mm256_add_epi32( A0A, CD ); \
|
||||
A09 = _mm256_add_epi32( A09, CC ); \
|
||||
A08 = _mm256_add_epi32( A08, CB ); \
|
||||
A07 = _mm256_add_epi32( A07, CA ); \
|
||||
A06 = _mm256_add_epi32( A06, C9 ); \
|
||||
A05 = _mm256_add_epi32( A05, C8 ); \
|
||||
A04 = _mm256_add_epi32( A04, C7 ); \
|
||||
A03 = _mm256_add_epi32( A03, C6 ); \
|
||||
A02 = _mm256_add_epi32( A02, C5 ); \
|
||||
A01 = _mm256_add_epi32( A01, C4 ); \
|
||||
A00 = _mm256_add_epi32( A00, C3 ); \
|
||||
AB = _mm256_add_epi32( AB, C6 ); \
|
||||
AA = _mm256_add_epi32( AA, C5 ); \
|
||||
A9 = _mm256_add_epi32( A9, C4 ); \
|
||||
A8 = _mm256_add_epi32( A8, C3 ); \
|
||||
A7 = _mm256_add_epi32( A7, C2 ); \
|
||||
A6 = _mm256_add_epi32( A6, C1 ); \
|
||||
A5 = _mm256_add_epi32( A5, C0 ); \
|
||||
A4 = _mm256_add_epi32( A4, CF ); \
|
||||
A3 = _mm256_add_epi32( A3, CE ); \
|
||||
A2 = _mm256_add_epi32( A2, CD ); \
|
||||
A1 = _mm256_add_epi32( A1, CC ); \
|
||||
A0 = _mm256_add_epi32( A0, CB ); \
|
||||
AB = _mm256_add_epi32( AB, CA ); \
|
||||
AA = _mm256_add_epi32( AA, C9 ); \
|
||||
A9 = _mm256_add_epi32( A9, C8 ); \
|
||||
A8 = _mm256_add_epi32( A8, C7 ); \
|
||||
A7 = _mm256_add_epi32( A7, C6 ); \
|
||||
A6 = _mm256_add_epi32( A6, C5 ); \
|
||||
A5 = _mm256_add_epi32( A5, C4 ); \
|
||||
A4 = _mm256_add_epi32( A4, C3 ); \
|
||||
A3 = _mm256_add_epi32( A3, C2 ); \
|
||||
A2 = _mm256_add_epi32( A2, C1 ); \
|
||||
A1 = _mm256_add_epi32( A1, C0 ); \
|
||||
A0 = _mm256_add_epi32( A0, CF ); \
|
||||
AB = _mm256_add_epi32( AB, CE ); \
|
||||
AA = _mm256_add_epi32( AA, CD ); \
|
||||
A9 = _mm256_add_epi32( A9, CC ); \
|
||||
A8 = _mm256_add_epi32( A8, CB ); \
|
||||
A7 = _mm256_add_epi32( A7, CA ); \
|
||||
A6 = _mm256_add_epi32( A6, C9 ); \
|
||||
A5 = _mm256_add_epi32( A5, C8 ); \
|
||||
A4 = _mm256_add_epi32( A4, C7 ); \
|
||||
A3 = _mm256_add_epi32( A3, C6 ); \
|
||||
A2 = _mm256_add_epi32( A2, C5 ); \
|
||||
A1 = _mm256_add_epi32( A1, C4 ); \
|
||||
A0 = _mm256_add_epi32( A0, C3 ); \
|
||||
} while (0)
|
||||
|
||||
#define INCR_W8 do { \
|
||||
@@ -660,8 +660,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
|
||||
|
||||
#define DECL_STATE \
|
||||
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
|
||||
A08, A09, A0A, A0B; \
|
||||
__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
|
||||
A8, A9, AA, AB; \
|
||||
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
|
||||
B8, B9, BA, BB, BC, BD, BE, BF; \
|
||||
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
|
||||
@@ -676,18 +676,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{ \
|
||||
if ( (state)->state_loaded ) \
|
||||
{ \
|
||||
A00 = (state)->A[0]; \
|
||||
A01 = (state)->A[1]; \
|
||||
A02 = (state)->A[2]; \
|
||||
A03 = (state)->A[3]; \
|
||||
A04 = (state)->A[4]; \
|
||||
A05 = (state)->A[5]; \
|
||||
A06 = (state)->A[6]; \
|
||||
A07 = (state)->A[7]; \
|
||||
A08 = (state)->A[8]; \
|
||||
A09 = (state)->A[9]; \
|
||||
A0A = (state)->A[10]; \
|
||||
A0B = (state)->A[11]; \
|
||||
A0 = (state)->A[0]; \
|
||||
A1 = (state)->A[1]; \
|
||||
A2 = (state)->A[2]; \
|
||||
A3 = (state)->A[3]; \
|
||||
A4 = (state)->A[4]; \
|
||||
A5 = (state)->A[5]; \
|
||||
A6 = (state)->A[6]; \
|
||||
A7 = (state)->A[7]; \
|
||||
A8 = (state)->A[8]; \
|
||||
A9 = (state)->A[9]; \
|
||||
AA = (state)->A[10]; \
|
||||
AB = (state)->A[11]; \
|
||||
B0 = (state)->B[0]; \
|
||||
B1 = (state)->B[1]; \
|
||||
B2 = (state)->B[2]; \
|
||||
@@ -724,18 +724,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
else \
|
||||
{ \
|
||||
(state)->state_loaded = true; \
|
||||
A00 = m128_const1_64( 0x20728DFD20728DFD ); \
|
||||
A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
|
||||
A02 = m128_const1_64( 0xE782B699E782B699 ); \
|
||||
A03 = m128_const1_64( 0x5530463255304632 ); \
|
||||
A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
|
||||
A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
|
||||
A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
|
||||
A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
|
||||
A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
|
||||
A09 = m128_const1_64( 0x8BD144108BD14410 ); \
|
||||
A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
|
||||
A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
|
||||
A0 = m128_const1_64( 0x20728DFD20728DFD ); \
|
||||
A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
|
||||
A2 = m128_const1_64( 0xE782B699E782B699 ); \
|
||||
A3 = m128_const1_64( 0x5530463255304632 ); \
|
||||
A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
|
||||
A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
|
||||
A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
|
||||
A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
|
||||
A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
|
||||
A9 = m128_const1_64( 0x8BD144108BD14410 ); \
|
||||
AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
|
||||
AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
|
||||
B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
|
||||
B1 = m128_const1_64( 0x07B385F307B385F3 ); \
|
||||
B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
|
||||
@@ -774,18 +774,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE(state) do { \
|
||||
(state)->A[0] = A00; \
|
||||
(state)->A[1] = A01; \
|
||||
(state)->A[2] = A02; \
|
||||
(state)->A[3] = A03; \
|
||||
(state)->A[4] = A04; \
|
||||
(state)->A[5] = A05; \
|
||||
(state)->A[6] = A06; \
|
||||
(state)->A[7] = A07; \
|
||||
(state)->A[8] = A08; \
|
||||
(state)->A[9] = A09; \
|
||||
(state)->A[10] = A0A; \
|
||||
(state)->A[11] = A0B; \
|
||||
(state)->A[0] = A0; \
|
||||
(state)->A[1] = A1; \
|
||||
(state)->A[2] = A2; \
|
||||
(state)->A[3] = A3; \
|
||||
(state)->A[4] = A4; \
|
||||
(state)->A[5] = A5; \
|
||||
(state)->A[6] = A6; \
|
||||
(state)->A[7] = A7; \
|
||||
(state)->A[8] = A8; \
|
||||
(state)->A[9] = A9; \
|
||||
(state)->A[10] = AA; \
|
||||
(state)->A[11] = AB; \
|
||||
(state)->B[0] = B0; \
|
||||
(state)->B[1] = B1; \
|
||||
(state)->B[2] = B2; \
|
||||
@@ -884,8 +884,8 @@ do { \
|
||||
|
||||
#define XOR_W \
|
||||
do { \
|
||||
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
|
||||
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
|
||||
A0 = _mm_xor_si128( A0, _mm_set1_epi32( Wlow ) ); \
|
||||
A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
@@ -940,60 +940,60 @@ do { \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_0 do { \
|
||||
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
|
||||
PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A2, A1, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A3, A2, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A4, A3, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A5, A4, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A6, A5, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A7, A6, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A8, A7, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A9, A8, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(AA, A9, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(AB, AA, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A0, AB, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A1, A0, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A2, A1, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A3, A2, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_1 do { \
|
||||
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
|
||||
PERM_ELT(A4, A3, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A5, A4, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A6, A5, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A7, A6, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A8, A7, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A9, A8, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(AA, A9, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(AB, AA, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A0, AB, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A1, A0, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A2, A1, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A3, A2, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A4, A3, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A5, A4, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A6, A5, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A7, A6, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_2 do { \
|
||||
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
|
||||
PERM_ELT(A8, A7, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A9, A8, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(AA, A9, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(AB, AA, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A0, AB, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A1, A0, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A2, A1, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A3, A2, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A4, A3, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A5, A4, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A6, A5, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A7, A6, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A8, A7, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A9, A8, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(AA, A9, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(AB, AA, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define APPLY_P \
|
||||
@@ -1017,42 +1017,42 @@ do { \
|
||||
PERM_STEP_0; \
|
||||
PERM_STEP_1; \
|
||||
PERM_STEP_2; \
|
||||
A0B = _mm_add_epi32( A0B, C6 ); \
|
||||
A0A = _mm_add_epi32( A0A, C5 ); \
|
||||
A09 = _mm_add_epi32( A09, C4 ); \
|
||||
A08 = _mm_add_epi32( A08, C3 ); \
|
||||
A07 = _mm_add_epi32( A07, C2 ); \
|
||||
A06 = _mm_add_epi32( A06, C1 ); \
|
||||
A05 = _mm_add_epi32( A05, C0 ); \
|
||||
A04 = _mm_add_epi32( A04, CF ); \
|
||||
A03 = _mm_add_epi32( A03, CE ); \
|
||||
A02 = _mm_add_epi32( A02, CD ); \
|
||||
A01 = _mm_add_epi32( A01, CC ); \
|
||||
A00 = _mm_add_epi32( A00, CB ); \
|
||||
A0B = _mm_add_epi32( A0B, CA ); \
|
||||
A0A = _mm_add_epi32( A0A, C9 ); \
|
||||
A09 = _mm_add_epi32( A09, C8 ); \
|
||||
A08 = _mm_add_epi32( A08, C7 ); \
|
||||
A07 = _mm_add_epi32( A07, C6 ); \
|
||||
A06 = _mm_add_epi32( A06, C5 ); \
|
||||
A05 = _mm_add_epi32( A05, C4 ); \
|
||||
A04 = _mm_add_epi32( A04, C3 ); \
|
||||
A03 = _mm_add_epi32( A03, C2 ); \
|
||||
A02 = _mm_add_epi32( A02, C1 ); \
|
||||
A01 = _mm_add_epi32( A01, C0 ); \
|
||||
A00 = _mm_add_epi32( A00, CF ); \
|
||||
A0B = _mm_add_epi32( A0B, CE ); \
|
||||
A0A = _mm_add_epi32( A0A, CD ); \
|
||||
A09 = _mm_add_epi32( A09, CC ); \
|
||||
A08 = _mm_add_epi32( A08, CB ); \
|
||||
A07 = _mm_add_epi32( A07, CA ); \
|
||||
A06 = _mm_add_epi32( A06, C9 ); \
|
||||
A05 = _mm_add_epi32( A05, C8 ); \
|
||||
A04 = _mm_add_epi32( A04, C7 ); \
|
||||
A03 = _mm_add_epi32( A03, C6 ); \
|
||||
A02 = _mm_add_epi32( A02, C5 ); \
|
||||
A01 = _mm_add_epi32( A01, C4 ); \
|
||||
A00 = _mm_add_epi32( A00, C3 ); \
|
||||
AB = _mm_add_epi32( AB, C6 ); \
|
||||
AA = _mm_add_epi32( AA, C5 ); \
|
||||
A9 = _mm_add_epi32( A9, C4 ); \
|
||||
A8 = _mm_add_epi32( A8, C3 ); \
|
||||
A7 = _mm_add_epi32( A7, C2 ); \
|
||||
A6 = _mm_add_epi32( A6, C1 ); \
|
||||
A5 = _mm_add_epi32( A5, C0 ); \
|
||||
A4 = _mm_add_epi32( A4, CF ); \
|
||||
A3 = _mm_add_epi32( A3, CE ); \
|
||||
A2 = _mm_add_epi32( A2, CD ); \
|
||||
A1 = _mm_add_epi32( A1, CC ); \
|
||||
A0 = _mm_add_epi32( A0, CB ); \
|
||||
AB = _mm_add_epi32( AB, CA ); \
|
||||
AA = _mm_add_epi32( AA, C9 ); \
|
||||
A9 = _mm_add_epi32( A9, C8 ); \
|
||||
A8 = _mm_add_epi32( A8, C7 ); \
|
||||
A7 = _mm_add_epi32( A7, C6 ); \
|
||||
A6 = _mm_add_epi32( A6, C5 ); \
|
||||
A5 = _mm_add_epi32( A5, C4 ); \
|
||||
A4 = _mm_add_epi32( A4, C3 ); \
|
||||
A3 = _mm_add_epi32( A3, C2 ); \
|
||||
A2 = _mm_add_epi32( A2, C1 ); \
|
||||
A1 = _mm_add_epi32( A1, C0 ); \
|
||||
A0 = _mm_add_epi32( A0, CF ); \
|
||||
AB = _mm_add_epi32( AB, CE ); \
|
||||
AA = _mm_add_epi32( AA, CD ); \
|
||||
A9 = _mm_add_epi32( A9, CC ); \
|
||||
A8 = _mm_add_epi32( A8, CB ); \
|
||||
A7 = _mm_add_epi32( A7, CA ); \
|
||||
A6 = _mm_add_epi32( A6, C9 ); \
|
||||
A5 = _mm_add_epi32( A5, C8 ); \
|
||||
A4 = _mm_add_epi32( A4, C7 ); \
|
||||
A3 = _mm_add_epi32( A3, C6 ); \
|
||||
A2 = _mm_add_epi32( A2, C5 ); \
|
||||
A1 = _mm_add_epi32( A1, C4 ); \
|
||||
A0 = _mm_add_epi32( A0, C3 ); \
|
||||
} while (0)
|
||||
|
||||
#define INCR_W do { \
|
||||
|
@@ -10,6 +10,7 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "Verthash.h"
|
||||
#include "mm_malloc.h"
|
||||
#include "malloc-huge.h"
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Verthash info management
|
||||
@@ -84,12 +85,18 @@ int verthash_info_init(verthash_info_t* info, const char* file_name)
|
||||
}
|
||||
|
||||
// Allocate data
|
||||
info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
|
||||
if (!info->data)
|
||||
info->data = (uint8_t *)malloc_hugepages( fileSize );
|
||||
if ( info->data )
|
||||
if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages");
|
||||
else
|
||||
{
|
||||
fclose(fileMiningData);
|
||||
// Memory allocation fatal error.
|
||||
return 2;
|
||||
info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
|
||||
if (!info->data)
|
||||
{
|
||||
fclose(fileMiningData);
|
||||
// Memory allocation fatal error.
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Load data
|
||||
|
@@ -29,16 +29,11 @@ void sha3_4way_keccakf( __m256i st[25] )
|
||||
for ( r = 0; r < KECCAKF_ROUNDS; r++ )
|
||||
{
|
||||
// Theta
|
||||
bc[0] = _mm256_xor_si256( st[0],
|
||||
mm256_xor4( st[5], st[10], st[15], st[20] ) );
|
||||
bc[1] = _mm256_xor_si256( st[1],
|
||||
mm256_xor4( st[6], st[11], st[16], st[21] ) );
|
||||
bc[2] = _mm256_xor_si256( st[2],
|
||||
mm256_xor4( st[7], st[12], st[17], st[22] ) );
|
||||
bc[3] = _mm256_xor_si256( st[3],
|
||||
mm256_xor4( st[8], st[13], st[18], st[23] ) );
|
||||
bc[4] = _mm256_xor_si256( st[4],
|
||||
mm256_xor4( st[9], st[14], st[19], st[24] ) );
|
||||
bc[0] = mm256_xor3( st[0], st[5], mm256_xor3( st[10], st[15], st[20] ) );
|
||||
bc[1] = mm256_xor3( st[1], st[6], mm256_xor3( st[11], st[16], st[21] ) );
|
||||
bc[2] = mm256_xor3( st[2], st[7], mm256_xor3( st[12], st[17], st[22] ) );
|
||||
bc[3] = mm256_xor3( st[3], st[8], mm256_xor3( st[13], st[18], st[23] ) );
|
||||
bc[4] = mm256_xor3( st[4], st[9], mm256_xor3( st[14], st[19], st[24] ) );
|
||||
|
||||
for ( i = 0; i < 5; i++ )
|
||||
{
|
||||
@@ -89,17 +84,13 @@ void sha3_4way_keccakf( __m256i st[25] )
|
||||
// Chi
|
||||
for ( j = 0; j < 25; j += 5 )
|
||||
{
|
||||
memcpy( bc, &st[ j ], 5*32 );
|
||||
st[ j ] = _mm256_xor_si256( st[ j ],
|
||||
_mm256_andnot_si256( bc[1], bc[2] ) );
|
||||
st[ j+1 ] = _mm256_xor_si256( st[ j+1 ],
|
||||
_mm256_andnot_si256( bc[2], bc[3] ) );
|
||||
st[ j+2 ] = _mm256_xor_si256( st[ j+2 ],
|
||||
_mm256_andnot_si256( bc[3], bc[4] ) );
|
||||
st[ j+3 ] = _mm256_xor_si256( st[ j+3 ],
|
||||
_mm256_andnot_si256( bc[4], bc[0] ) );
|
||||
st[ j+4 ] = _mm256_xor_si256( st[ j+4 ],
|
||||
_mm256_andnot_si256( bc[0], bc[1] ) );
|
||||
bc[0] = st[j];
|
||||
bc[1] = st[j+1];
|
||||
st[ j ] = mm256_xorandnot( st[ j ], st[j+1], st[j+2] );
|
||||
st[ j+1 ] = mm256_xorandnot( st[ j+1 ], st[j+2], st[j+3] );
|
||||
st[ j+2 ] = mm256_xorandnot( st[ j+2 ], st[j+3], st[j+4] );
|
||||
st[ j+3 ] = mm256_xorandnot( st[ j+3 ], st[j+4], bc[0] );
|
||||
st[ j+4 ] = mm256_xorandnot( st[ j+4 ], bc[0], bc[1] );
|
||||
}
|
||||
|
||||
// Iota
|
||||
|
@@ -127,7 +127,7 @@ bool register_verthash_algo( algo_gate_t* gate )
|
||||
{
|
||||
opt_target_factor = 256.0;
|
||||
gate->scanhash = (void*)&scanhash_verthash;
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
|
||||
const char *verthash_data_file = opt_data_file ? opt_data_file
|
||||
: default_verthash_data_file;
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.2.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.3.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.19.2'
|
||||
PACKAGE_STRING='cpuminer-opt 3.19.2'
|
||||
PACKAGE_VERSION='3.19.3'
|
||||
PACKAGE_STRING='cpuminer-opt 3.19.3'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.19.2 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.19.3 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1404,7 +1404,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.19.2:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.19.3:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1509,7 +1509,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.19.2
|
||||
cpuminer-opt configure 3.19.3
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.19.2, which was
|
||||
It was created by cpuminer-opt $as_me 3.19.3, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2993,7 +2993,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.19.2'
|
||||
VERSION='3.19.3'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.19.2, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.19.3, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.19.2
|
||||
cpuminer-opt config.status 3.19.3
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.19.2])
|
||||
AC_INIT([cpuminer-opt], [3.19.3])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
36
malloc-huge.c
Normal file
36
malloc-huge.c
Normal file
@@ -0,0 +1,36 @@
|
||||
#include "malloc-huge.h"
|
||||
#include "miner.h"
|
||||
|
||||
#define HUGEPAGE_SIZE_2M (2 * 1024 * 1024)
|
||||
|
||||
void *malloc_hugepages( size_t size )
|
||||
{
|
||||
#if !(defined(MAP_HUGETLB) && defined(MAP_ANON))
|
||||
// applog( LOG_WARNING, "Huge pages not available",size);
|
||||
return NULL;
|
||||
#else
|
||||
|
||||
if ( size < HUGEPAGE_MIN_ALLOC )
|
||||
{
|
||||
// applog( LOG_WARNING, "Block too small for huge pages: %lu bytes",size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE_2M - 1;
|
||||
void *p = NULL;
|
||||
int flags =
|
||||
#ifdef MAP_NOCORE
|
||||
MAP_NOCORE |
|
||||
#endif
|
||||
MAP_HUGETLB | MAP_ANON | MAP_PRIVATE;
|
||||
|
||||
// round size up to next page boundary
|
||||
size = ( size + hugepage_mask ) & (~hugepage_mask);
|
||||
|
||||
p = mmap( NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0 );
|
||||
if ( p == MAP_FAILED )
|
||||
p = NULL;
|
||||
return p;
|
||||
#endif
|
||||
}
|
||||
|
24
malloc-huge.h
Normal file
24
malloc-huge.h
Normal file
@@ -0,0 +1,24 @@
|
||||
#if !(defined(MALLOC_HUGE__))
|
||||
#define MALLOC_HUGE__
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __unix__
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
|
||||
#if defined(MAP_HUGETLB)
|
||||
|
||||
// Minimum block size 6 MiB to use huge pages
|
||||
#define HUGEPAGE_MIN_ALLOC (6 * 1024 * 1024)
|
||||
|
||||
#endif
|
||||
|
||||
// Attempt to allocate memory backed by 2 MiB pages, returns NULL on failure.
|
||||
void *malloc_hugepages( size_t size );
|
||||
|
||||
#endif
|
||||
|
@@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
|
||||
./clean-all.sh || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
|
||||
@@ -53,7 +53,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
|
||||
# AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx512.exe
|
||||
@@ -61,7 +61,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
|
||||
# AVX2 SHA VAES: Intel Alderlake, AMD Zen3
|
||||
make clean || echo done
|
||||
rm -f config.status
|
||||
CFLAGS="$DEFAULT_CFLAGS -mavx2 -msha -mvaes" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
|
||||
@@ -69,7 +69,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
|
||||
# AVX2 AES SHA: AMD Zen1
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx2-sha.exe
|
||||
@@ -77,7 +77,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe
|
||||
# AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="$DEFAULT_CFLAGS -march=core-avx2" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx2.exe
|
||||
@@ -85,7 +85,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe
|
||||
# AVX AES: Intel Sandybridge, Ivybridge
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="$DEFAULT_CFLAGS_OLD -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=corei7-avx -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx.exe
|
||||
@@ -93,7 +93,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
|
||||
# SSE4.2 AES: Intel Westmere
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="$DEFAULT_CFLAGS_OLD -march=westmere -maes" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=westmere -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-aes-sse42.exe
|
||||
@@ -118,9 +118,16 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
|
||||
# Generic SSE2
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="$DEFAULT_CFLAGS_OLD -msse2" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-sse2.exe
|
||||
make clean || echo clean
|
||||
|
||||
# Native with CPU groups ennabled
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
|
||||
|
Reference in New Issue
Block a user