Compare commits

..

2 Commits

Author SHA1 Message Date
Jay D Dee
17ccbc328f v3.19.3 2022-01-07 12:07:38 -05:00
Jay D Dee
0e3945ddb5 v3.19.2 2021-12-30 16:28:24 -05:00
20 changed files with 783 additions and 626 deletions

View File

@@ -21,6 +21,7 @@ cpuminer_SOURCES = \
api.c \
sysinfos.c \
algo-gate-api.c\
malloc-huge.c \
algo/argon2/argon2a/argon2a.c \
algo/argon2/argon2a/ar2/argon2.c \
algo/argon2/argon2a/ar2/opt.c \

View File

@@ -65,6 +65,25 @@ If not what makes it happen or not happen?
Change Log
----------
v3.19.3
Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available.
Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512.
v3.19.2
Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1.
Reduce log noise when replies to submitted shares are lost due to stratum errors.
Fugue prehash optimization for X16r family AVX2 & AVX512.
Small speed improvement for Hamsi AVX2 & AVX512.
Win: With CPU groups enabled the number of CPUs displayed in the ASCII art
affinity map is the number of CPUs in a CPU group, was number of CPUs up to 64.
v3.19.1
Changes to Windows binaries package:

View File

@@ -37,6 +37,13 @@
#if defined(__AVX512F__)
static inline __m512i blamka( __m512i x, __m512i y )
{
__m512i xy = _mm512_mul_epu32( x, y );
return _mm512_add_epi64( _mm512_add_epi64( x, y ),
_mm512_add_epi64( xy, xy ) );
}
static void fill_block( __m512i *state, const block *ref_block,
block *next_block, int with_xor )
{

View File

@@ -328,9 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
#include <immintrin.h>
#define ROR64(x, n) _mm512_ror_epi64((x), (n))
static __m512i muladd(__m512i x, __m512i y)
static inline __m512i muladd(__m512i x, __m512i y)
{
__m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
@@ -344,8 +342,8 @@ static __m512i muladd(__m512i x, __m512i y)
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
\
D0 = ROR64(D0, 32); \
D1 = ROR64(D1, 32); \
D0 = _mm512_ror_epi64(D0, 32); \
D1 = _mm512_ror_epi64(D1, 32); \
\
C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \
@@ -353,8 +351,8 @@ static __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
\
B0 = ROR64(B0, 24); \
B1 = ROR64(B1, 24); \
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0)
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -365,8 +363,8 @@ static __m512i muladd(__m512i x, __m512i y)
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
\
D0 = ROR64(D0, 16); \
D1 = ROR64(D1, 16); \
D0 = _mm512_ror_epi64(D0, 16); \
D1 = _mm512_ror_epi64(D1, 16); \
\
C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \
@@ -374,8 +372,8 @@ static __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
\
B0 = ROR64(B0, 63); \
B1 = ROR64(B1, 63); \
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -417,11 +415,10 @@ static __m512i muladd(__m512i x, __m512i y)
#define SWAP_HALVES(A0, A1) \
do { \
__m512i t0, t1; \
t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
A0 = t0; \
A1 = t1; \
__m512i t; \
t = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
A1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
A0 = t; \
} while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \

View File

@@ -37,12 +37,23 @@ typedef struct
} hashState_fugue __attribute__ ((aligned (64)));
// These functions are deprecated, use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
#define fugue512_init( state ) \
fugue512_Init( state, 512 )
#define fugue512_update( state, data, len ) \
fugue512_Update( state, data, (len)<<3 )
#define fugue512_final \
fugue512_Final
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
#endif // AES

View File

@@ -545,31 +545,33 @@ static const sph_u32 T512[64][16] = {
#define sE c7
#define sF m7
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi 8 way AVX512
// Intel says _mm512_movepi64_mask has (1L/1T) timimg while
// _mm512_cmplt_epi64_mask as (3L/1T) timing, however, when tested hashing X13
// on i9-9940x cmplt with zero was 3% faster than movepi.
#define INPUT_BIG8 \
do { \
__m512i db = *buf; \
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
__m512i db = _mm512_ror_epi64( *buf, 1 ); \
const __m512i zero = m512_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
dm = mm512_negate_32( _mm512_or_si512( dm, \
_mm512_slli_epi64( dm, 32 ) ) ); \
m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
m3 = _mm512_mask_xor_epi64( m3, dm, m3, m512_const1_64( tp[3] ) ); \
m4 = _mm512_mask_xor_epi64( m4, dm, m4, m512_const1_64( tp[4] ) ); \
m5 = _mm512_mask_xor_epi64( m5, dm, m5, m512_const1_64( tp[5] ) ); \
m6 = _mm512_mask_xor_epi64( m6, dm, m6, m512_const1_64( tp[6] ) ); \
m7 = _mm512_mask_xor_epi64( m7, dm, m7, m512_const1_64( tp[7] ) ); \
db = _mm512_ror_epi64( db, 1 ); \
tp += 8; \
db = _mm512_srli_epi64( db, 1 ); \
} \
} while (0)
@@ -609,199 +611,192 @@ do { \
#define READ_STATE_BIG8(sc) \
do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c0 = sc->h[0]; \
c1 = sc->h[1]; \
c2 = sc->h[2]; \
c3 = sc->h[3]; \
c4 = sc->h[4]; \
c5 = sc->h[5]; \
c6 = sc->h[6]; \
c7 = sc->h[7]; \
} while (0)
#define WRITE_STATE_BIG8(sc) \
do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0] = c0; \
sc->h[1] = c1; \
sc->h[2] = c2; \
sc->h[3] = c3; \
sc->h[4] = c4; \
sc->h[5] = c5; \
sc->h[6] = c6; \
sc->h[7] = c7; \
} while (0)
#define ROUND_BIG8( alpha ) \
do { \
__m512i t0, t1, t2, t3; \
s0 = _mm512_xor_si512( s0, alpha[ 0] ); \
s1 = _mm512_xor_si512( s1, alpha[ 1] ); \
s2 = _mm512_xor_si512( s2, alpha[ 2] ); \
s3 = _mm512_xor_si512( s3, alpha[ 3] ); \
s4 = _mm512_xor_si512( s4, alpha[ 4] ); \
s5 = _mm512_xor_si512( s5, alpha[ 5] ); \
s6 = _mm512_xor_si512( s6, alpha[ 6] ); \
s7 = _mm512_xor_si512( s7, alpha[ 7] ); \
s8 = _mm512_xor_si512( s8, alpha[ 8] ); \
s9 = _mm512_xor_si512( s9, alpha[ 9] ); \
sA = _mm512_xor_si512( sA, alpha[10] ); \
sB = _mm512_xor_si512( sB, alpha[11] ); \
sC = _mm512_xor_si512( sC, alpha[12] ); \
sD = _mm512_xor_si512( sD, alpha[13] ); \
sE = _mm512_xor_si512( sE, alpha[14] ); \
sF = _mm512_xor_si512( sF, alpha[15] ); \
s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
s3 = _mm512_xor_si512( s3, alpha[ 3] ); /* c1 */ \
s4 = _mm512_xor_si512( s4, alpha[ 4] ); /* c2 */ \
s5 = _mm512_xor_si512( s5, alpha[ 5] ); /* m2 */ \
s6 = _mm512_xor_si512( s6, alpha[ 6] ); /* c3 */ \
s7 = _mm512_xor_si512( s7, alpha[ 7] ); /* m3 */ \
s8 = _mm512_xor_si512( s8, alpha[ 8] ); /* m4 */ \
s9 = _mm512_xor_si512( s9, alpha[ 9] ); /* c4 */ \
sA = _mm512_xor_si512( sA, alpha[10] ); /* m5 */ \
sB = _mm512_xor_si512( sB, alpha[11] ); /* c5 */ \
sC = _mm512_xor_si512( sC, alpha[12] ); /* c6 */ \
sD = _mm512_xor_si512( sD, alpha[13] ); /* m6 */ \
sE = _mm512_xor_si512( sE, alpha[14] ); /* c7 */ \
sF = _mm512_xor_si512( sF, alpha[15] ); /* m7 */ \
\
SBOX8( s0, s4, s8, sC ); \
SBOX8( s1, s5, s9, sD ); \
SBOX8( s2, s6, sA, sE ); \
SBOX8( s3, s7, sB, sF ); \
SBOX8( s0, s4, s8, sC ); /* ( m0, c2, m4, c6 ) */ \
SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
_mm512_bslli_epi128( s5, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
_mm512_bslli_epi128( sE, 4 ) ); \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t1, s9, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( s6, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
_mm512_bslli_epi128( sF, 4 ) ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
L8( s1, t1, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
_mm512_bslli_epi128( s7, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
_mm512_bslli_epi128( sC, 4 ) ); \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t1, sB, t3 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
_mm512_bslli_epi128( s4, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
L8( s3, t1, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
_mm512_bslli_epi128( sB, 4 ) ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
L8( t0, t1, t2, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
} while (0)
#define P_BIG8 \
do { \
__m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
} while (0)
#define PF_BIG8 \
do { \
__m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
} while (0)
#define T_BIG8 \
do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB ); /* c5 */ \
c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA ); /* m5 */ \
c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 ); /* c4 */ \
c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 ); /* m4 */ \
c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 ); /* c1 */ \
c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 ); /* m1 */ \
c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 ); /* c0 */ \
c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \
} while (0)
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
@@ -838,7 +833,6 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
WRITE_STATE_BIG8( sc );
}
void hamsi512_8way_init( hamsi_8way_big_context *sc )
{
sc->partial_len = 0;
@@ -888,13 +882,12 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
#define INPUT_BIG \
do { \
__m256i db = *buf; \
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
for ( int u = 0; u < 64; u++ ) \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 63; u >= 0; u-- ) \
{ \
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
dm = mm256_negate_32( _mm256_or_si256( dm, \
_mm256_slli_epi64( dm, 32 ) ) ); \
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
m256_const1_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
@@ -912,7 +905,6 @@ do { \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
m256_const1_64( tp[7] ) ) ); \
tp += 8; \
db = _mm256_srli_epi64( db, 1 ); \
} \
} while (0)
@@ -961,47 +953,28 @@ do { \
#define READ_STATE_BIG(sc) \
do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c0 = sc->h[0]; \
c1 = sc->h[1]; \
c2 = sc->h[2]; \
c3 = sc->h[3]; \
c4 = sc->h[4]; \
c5 = sc->h[5]; \
c6 = sc->h[6]; \
c7 = sc->h[7]; \
} while (0)
#define WRITE_STATE_BIG(sc) \
do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0] = c0; \
sc->h[1] = c1; \
sc->h[2] = c2; \
sc->h[3] = c3; \
sc->h[4] = c4; \
sc->h[5] = c5; \
sc->h[6] = c6; \
sc->h[7] = c7; \
} while (0)
/*
#define s0 m0
#define s1 c0
#define s2 m1
#define s3 c1
#define s4 c2
#define s5 m2
#define s6 c3
#define s7 m3
#define s8 m4
#define s9 c4
#define sA m5
#define sB c5
#define sC c6
#define sD m6
#define sE c7
#define sF m7
*/
#define ROUND_BIG( alpha ) \
do { \
__m256i t0, t1, t2, t3; \
@@ -1027,151 +1000,145 @@ do { \
SBOX( s2, s6, sA, sE ); \
SBOX( s3, s7, sB, sF ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \
_mm256_bslli_epi128( s5, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \
_mm256_bslli_epi128( sE, 4 ), 0xAA ); \
s4 = mm256_swap64_32( s4 ); \
s5 = mm256_swap64_32( s5 ); \
sD = mm256_swap64_32( sD ); \
sE = mm256_swap64_32( sE ); \
t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \
t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \
L( s0, t1, s9, t3 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \
s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \
sD = _mm256_blend_epi32( sD, t3, 0x55 ); \
sE = _mm256_blend_epi32( sE, t3, 0xaa ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
_mm256_bslli_epi128( s6, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \
_mm256_bslli_epi128( sF, 4 ), 0xAA ); \
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \
t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
L( s1, t1, sA, t3 ); \
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \
sE = _mm256_blend_epi32( sE, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \
_mm256_bslli_epi128( s7, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \
_mm256_bslli_epi128( sC, 4 ), 0xAA ); \
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \
t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \
L( s2, t1, sB, t3 ); \
s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \
s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \
sF = _mm256_blend_epi32( sF, t3, 0x55 ); \
sC = _mm256_blend_epi32( sC, t3, 0xaa ); \
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \
_mm256_bslli_epi128( s4, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \
t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
L( s3, t1, s8, t3 ); \
s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \
s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \
sC = _mm256_blend_epi32( sC, t3, 0x55 ); \
sD = _mm256_blend_epi32( sD, t3, 0xaa ); \
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
\
t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \
t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \
t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \
_mm256_bslli_epi128( sB, 4 ), 0xAA ); \
t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \
t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \
t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \
t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \
t3 = mm256_swap64_32( t3 ); \
L( t0, t1, t2, t3 ); \
t3 = mm256_swap64_32( t3 ); \
s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \
s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \
s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \
s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \
sA = _mm256_blend_epi32( sA, t2, 0xAA ); \
s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \
sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \
s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \
s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \
sA = _mm256_blend_epi32( sA, t2, 0xaa ); \
s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \
sB = _mm256_blend_epi32( sB, t3, 0x55 ); \
\
t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \
t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \
t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \
t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \
L( t0, t1, t2, t3 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \
sC = _mm256_blend_epi32( sC, t0, 0xAA ); \
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \
s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \
sC = _mm256_blend_epi32( sC, t0, 0xaa ); \
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
sD = _mm256_blend_epi32( sD, t1, 0xaa ); \
s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \
sE = _mm256_blend_epi32( sE, t2, 0xaa ); \
s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, t3, 0xAA ); \
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
s4 = mm256_swap64_32( s4 ); \
s5 = mm256_swap64_32( s5 ); \
sD = mm256_swap64_32( sD ); \
sE = mm256_swap64_32( sE ); \
} while (0)
#define P_BIG \
do { \
__m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
} while (0)
#define PF_BIG \
do { \
__m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
} while (0)
#define T_BIG \
do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \
c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \
c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \
c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \
c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \
c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \
c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \
c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \
c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \
} while (0)
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )

View File

@@ -53,7 +53,8 @@ static const uint64_t RC[] = {
#define WRITE_STATE(sc)
#define MOV64(d, s) (d = s)
#define XOR64_IOTA XOR64
#define XOR64_IOTA XOR
#define LPAR (
#define RPAR )
@@ -71,14 +72,15 @@ static const uint64_t RC[] = {
// Targetted macros, keccak-macros.h is included for each target.
#define DECL64(x) __m512i x
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
#define XOR3( d, a, b, c ) (d = mm512_xor3( a, b, c ))
#include "keccak-macros.c"
@@ -236,6 +238,7 @@ keccak512_8way_close(void *cc, void *dst)
#undef INPUT_BUF
#undef DECL64
#undef XOR64
#undef XOR
#undef AND64
#undef OR64
#undef NOT64
@@ -243,7 +246,7 @@ keccak512_8way_close(void *cc, void *dst)
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND
#undef XOR3
#endif // AVX512
// AVX2
@@ -255,13 +258,15 @@ keccak512_8way_close(void *cc, void *dst)
} while (0)
#define DECL64(x) __m256i x
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))
#include "keccak-macros.c"
@@ -421,6 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
#undef INPUT_BUF
#undef DECL64
#undef XOR64
#undef XOR
#undef AND64
#undef OR64
#undef NOT64
@@ -428,5 +434,6 @@ keccak512_4way_close(void *cc, void *dst)
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND
#undef XOR3
#endif // AVX2

View File

@@ -1,6 +1,19 @@
#ifdef TH_ELT
#undef TH_ELT
#endif
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
XOR3( tt0, d0, d1, d4 ); \
XOR( tt1, d2, d3 ); \
XOR( tt0, tt0, tt1 ); \
ROL64( tt0, tt0, 1 ); \
XOR3( tt1, c0, c1, c4 ); \
XOR3( tt0, tt0, c2, c3 ); \
XOR( t, tt0, tt1 ); \
} while (0)
/*
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
@@ -17,7 +30,7 @@
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
*/
#ifdef THETA
#undef THETA
#endif

View File

@@ -34,6 +34,7 @@
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#include <mm_malloc.h>
#include "malloc-huge.h"
static const uint32_t keypad[12] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
@@ -1487,11 +1488,19 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
bool scrypt_miner_thread_init( int thr_id )
{
scratchbuf = _mm_malloc( scratchbuf_size, 128 );
scratchbuf = malloc_hugepages( scratchbuf_size );
if ( scratchbuf )
return true;
{
if ( opt_debug )
applog( LOG_NOTICE, "Thread %u is using huge pages", thr_id );
}
else
scratchbuf = _mm_malloc( scratchbuf_size, 128 );
if ( scratchbuf ) return true;
applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id );
return false;
return false;
}
bool register_scrypt_algo( algo_gate_t* gate )
@@ -1544,7 +1553,6 @@ bool register_scrypt_algo( algo_gate_t* gate )
format_number_si( &t_size, t_units );
format_number_si( &d_size, d_units );
applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n",
SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units );

View File

@@ -62,8 +62,8 @@ extern "C"{
#if defined(__AVX2__)
#define DECL_STATE8 \
__m256i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m256i A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
__m256i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m256i C0, C1, C2, C3, C4, C5, C6, C7, \
@@ -78,18 +78,18 @@ extern "C"{
{ \
if ( (state)->state_loaded ) \
{ \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
A0 = (state)->A[0]; \
A1 = (state)->A[1]; \
A2 = (state)->A[2]; \
A3 = (state)->A[3]; \
A4 = (state)->A[4]; \
A5 = (state)->A[5]; \
A6 = (state)->A[6]; \
A7 = (state)->A[7]; \
A8 = (state)->A[8]; \
A9 = (state)->A[9]; \
AA = (state)->A[10]; \
AB = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
@@ -126,18 +126,18 @@ extern "C"{
else \
{ \
(state)->state_loaded = true; \
A00 = m256_const1_64( 0x20728DFD20728DFD ); \
A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m256_const1_64( 0xE782B699E782B699 ); \
A03 = m256_const1_64( 0x5530463255304632 ); \
A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m256_const1_64( 0x8BD144108BD14410 ); \
A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
A0 = m256_const1_64( 0x20728DFD20728DFD ); \
A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A2 = m256_const1_64( 0xE782B699E782B699 ); \
A3 = m256_const1_64( 0x5530463255304632 ); \
A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A9 = m256_const1_64( 0x8BD144108BD14410 ); \
AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m256_const1_64( 0x07B385F307B385F3 ); \
B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
@@ -176,18 +176,18 @@ extern "C"{
} while (0)
#define WRITE_STATE8(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->A[0] = A0; \
(state)->A[1] = A1; \
(state)->A[2] = A2; \
(state)->A[3] = A3; \
(state)->A[4] = A4; \
(state)->A[5] = A5; \
(state)->A[6] = A6; \
(state)->A[7] = A7; \
(state)->A[8] = A8; \
(state)->A[9] = A9; \
(state)->A[10] = AA; \
(state)->A[11] = AB; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
@@ -286,8 +286,8 @@ do { \
#define XOR_W8 \
do { \
A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \
A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \
A0 = _mm256_xor_si256( A0, _mm256_set1_epi32( Wlow ) ); \
A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \
} while (0)
#define SWAP_BC8 \
@@ -321,60 +321,60 @@ do { \
} while (0)
#define PERM_STEP_0_8 do { \
PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1_8 do { \
PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2_8 do { \
PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P8 \
@@ -398,42 +398,42 @@ do { \
PERM_STEP_0_8; \
PERM_STEP_1_8; \
PERM_STEP_2_8; \
A0B = _mm256_add_epi32( A0B, C6 ); \
A0A = _mm256_add_epi32( A0A, C5 ); \
A09 = _mm256_add_epi32( A09, C4 ); \
A08 = _mm256_add_epi32( A08, C3 ); \
A07 = _mm256_add_epi32( A07, C2 ); \
A06 = _mm256_add_epi32( A06, C1 ); \
A05 = _mm256_add_epi32( A05, C0 ); \
A04 = _mm256_add_epi32( A04, CF ); \
A03 = _mm256_add_epi32( A03, CE ); \
A02 = _mm256_add_epi32( A02, CD ); \
A01 = _mm256_add_epi32( A01, CC ); \
A00 = _mm256_add_epi32( A00, CB ); \
A0B = _mm256_add_epi32( A0B, CA ); \
A0A = _mm256_add_epi32( A0A, C9 ); \
A09 = _mm256_add_epi32( A09, C8 ); \
A08 = _mm256_add_epi32( A08, C7 ); \
A07 = _mm256_add_epi32( A07, C6 ); \
A06 = _mm256_add_epi32( A06, C5 ); \
A05 = _mm256_add_epi32( A05, C4 ); \
A04 = _mm256_add_epi32( A04, C3 ); \
A03 = _mm256_add_epi32( A03, C2 ); \
A02 = _mm256_add_epi32( A02, C1 ); \
A01 = _mm256_add_epi32( A01, C0 ); \
A00 = _mm256_add_epi32( A00, CF ); \
A0B = _mm256_add_epi32( A0B, CE ); \
A0A = _mm256_add_epi32( A0A, CD ); \
A09 = _mm256_add_epi32( A09, CC ); \
A08 = _mm256_add_epi32( A08, CB ); \
A07 = _mm256_add_epi32( A07, CA ); \
A06 = _mm256_add_epi32( A06, C9 ); \
A05 = _mm256_add_epi32( A05, C8 ); \
A04 = _mm256_add_epi32( A04, C7 ); \
A03 = _mm256_add_epi32( A03, C6 ); \
A02 = _mm256_add_epi32( A02, C5 ); \
A01 = _mm256_add_epi32( A01, C4 ); \
A00 = _mm256_add_epi32( A00, C3 ); \
AB = _mm256_add_epi32( AB, C6 ); \
AA = _mm256_add_epi32( AA, C5 ); \
A9 = _mm256_add_epi32( A9, C4 ); \
A8 = _mm256_add_epi32( A8, C3 ); \
A7 = _mm256_add_epi32( A7, C2 ); \
A6 = _mm256_add_epi32( A6, C1 ); \
A5 = _mm256_add_epi32( A5, C0 ); \
A4 = _mm256_add_epi32( A4, CF ); \
A3 = _mm256_add_epi32( A3, CE ); \
A2 = _mm256_add_epi32( A2, CD ); \
A1 = _mm256_add_epi32( A1, CC ); \
A0 = _mm256_add_epi32( A0, CB ); \
AB = _mm256_add_epi32( AB, CA ); \
AA = _mm256_add_epi32( AA, C9 ); \
A9 = _mm256_add_epi32( A9, C8 ); \
A8 = _mm256_add_epi32( A8, C7 ); \
A7 = _mm256_add_epi32( A7, C6 ); \
A6 = _mm256_add_epi32( A6, C5 ); \
A5 = _mm256_add_epi32( A5, C4 ); \
A4 = _mm256_add_epi32( A4, C3 ); \
A3 = _mm256_add_epi32( A3, C2 ); \
A2 = _mm256_add_epi32( A2, C1 ); \
A1 = _mm256_add_epi32( A1, C0 ); \
A0 = _mm256_add_epi32( A0, CF ); \
AB = _mm256_add_epi32( AB, CE ); \
AA = _mm256_add_epi32( AA, CD ); \
A9 = _mm256_add_epi32( A9, CC ); \
A8 = _mm256_add_epi32( A8, CB ); \
A7 = _mm256_add_epi32( A7, CA ); \
A6 = _mm256_add_epi32( A6, C9 ); \
A5 = _mm256_add_epi32( A5, C8 ); \
A4 = _mm256_add_epi32( A4, C7 ); \
A3 = _mm256_add_epi32( A3, C6 ); \
A2 = _mm256_add_epi32( A2, C5 ); \
A1 = _mm256_add_epi32( A1, C4 ); \
A0 = _mm256_add_epi32( A0, C3 ); \
} while (0)
#define INCR_W8 do { \
@@ -660,8 +660,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#define DECL_STATE \
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
@@ -676,18 +676,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{ \
if ( (state)->state_loaded ) \
{ \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
A0 = (state)->A[0]; \
A1 = (state)->A[1]; \
A2 = (state)->A[2]; \
A3 = (state)->A[3]; \
A4 = (state)->A[4]; \
A5 = (state)->A[5]; \
A6 = (state)->A[6]; \
A7 = (state)->A[7]; \
A8 = (state)->A[8]; \
A9 = (state)->A[9]; \
AA = (state)->A[10]; \
AB = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
@@ -724,18 +724,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
else \
{ \
(state)->state_loaded = true; \
A00 = m128_const1_64( 0x20728DFD20728DFD ); \
A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m128_const1_64( 0xE782B699E782B699 ); \
A03 = m128_const1_64( 0x5530463255304632 ); \
A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m128_const1_64( 0x8BD144108BD14410 ); \
A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
A0 = m128_const1_64( 0x20728DFD20728DFD ); \
A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
A2 = m128_const1_64( 0xE782B699E782B699 ); \
A3 = m128_const1_64( 0x5530463255304632 ); \
A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
A9 = m128_const1_64( 0x8BD144108BD14410 ); \
AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m128_const1_64( 0x07B385F307B385F3 ); \
B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
@@ -774,18 +774,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
} while (0)
#define WRITE_STATE(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->A[0] = A0; \
(state)->A[1] = A1; \
(state)->A[2] = A2; \
(state)->A[3] = A3; \
(state)->A[4] = A4; \
(state)->A[5] = A5; \
(state)->A[6] = A6; \
(state)->A[7] = A7; \
(state)->A[8] = A8; \
(state)->A[9] = A9; \
(state)->A[10] = AA; \
(state)->A[11] = AB; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
@@ -884,8 +884,8 @@ do { \
#define XOR_W \
do { \
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
A0 = _mm_xor_si128( A0, _mm_set1_epi32( Wlow ) ); \
A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
} while (0)
@@ -940,60 +940,60 @@ do { \
} while (0)
#define PERM_STEP_0 do { \
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A2, A1, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A3, A2, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A4, A3, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A5, A4, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A6, A5, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A7, A6, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A8, A7, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A9, A8, B9, B6, B2, BF, CF, M9); \
PERM_ELT(AA, A9, BA, B7, B3, B0, CE, MA); \
PERM_ELT(AB, AA, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A0, AB, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A1, A0, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A2, A1, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A3, A2, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1 do { \
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
PERM_ELT(A4, A3, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A5, A4, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A6, A5, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A7, A6, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A8, A7, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A9, A8, B5, B2, BE, BB, C3, M5); \
PERM_ELT(AA, A9, B6, B3, BF, BC, C2, M6); \
PERM_ELT(AB, AA, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A0, AB, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A1, A0, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A2, A1, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A3, A2, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A4, A3, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A5, A4, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A6, A5, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A7, A6, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2 do { \
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
PERM_ELT(A8, A7, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A9, A8, B1, BE, BA, B7, C7, M1); \
PERM_ELT(AA, A9, B2, BF, BB, B8, C6, M2); \
PERM_ELT(AB, AA, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A0, AB, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A1, A0, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A2, A1, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A3, A2, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A4, A3, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A5, A4, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A6, A5, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A7, A6, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A8, A7, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A9, A8, BD, BA, B6, B3, CB, MD); \
PERM_ELT(AA, A9, BE, BB, B7, B4, CA, ME); \
PERM_ELT(AB, AA, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P \
@@ -1017,42 +1017,42 @@ do { \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \
A0B = _mm_add_epi32( A0B, C6 ); \
A0A = _mm_add_epi32( A0A, C5 ); \
A09 = _mm_add_epi32( A09, C4 ); \
A08 = _mm_add_epi32( A08, C3 ); \
A07 = _mm_add_epi32( A07, C2 ); \
A06 = _mm_add_epi32( A06, C1 ); \
A05 = _mm_add_epi32( A05, C0 ); \
A04 = _mm_add_epi32( A04, CF ); \
A03 = _mm_add_epi32( A03, CE ); \
A02 = _mm_add_epi32( A02, CD ); \
A01 = _mm_add_epi32( A01, CC ); \
A00 = _mm_add_epi32( A00, CB ); \
A0B = _mm_add_epi32( A0B, CA ); \
A0A = _mm_add_epi32( A0A, C9 ); \
A09 = _mm_add_epi32( A09, C8 ); \
A08 = _mm_add_epi32( A08, C7 ); \
A07 = _mm_add_epi32( A07, C6 ); \
A06 = _mm_add_epi32( A06, C5 ); \
A05 = _mm_add_epi32( A05, C4 ); \
A04 = _mm_add_epi32( A04, C3 ); \
A03 = _mm_add_epi32( A03, C2 ); \
A02 = _mm_add_epi32( A02, C1 ); \
A01 = _mm_add_epi32( A01, C0 ); \
A00 = _mm_add_epi32( A00, CF ); \
A0B = _mm_add_epi32( A0B, CE ); \
A0A = _mm_add_epi32( A0A, CD ); \
A09 = _mm_add_epi32( A09, CC ); \
A08 = _mm_add_epi32( A08, CB ); \
A07 = _mm_add_epi32( A07, CA ); \
A06 = _mm_add_epi32( A06, C9 ); \
A05 = _mm_add_epi32( A05, C8 ); \
A04 = _mm_add_epi32( A04, C7 ); \
A03 = _mm_add_epi32( A03, C6 ); \
A02 = _mm_add_epi32( A02, C5 ); \
A01 = _mm_add_epi32( A01, C4 ); \
A00 = _mm_add_epi32( A00, C3 ); \
AB = _mm_add_epi32( AB, C6 ); \
AA = _mm_add_epi32( AA, C5 ); \
A9 = _mm_add_epi32( A9, C4 ); \
A8 = _mm_add_epi32( A8, C3 ); \
A7 = _mm_add_epi32( A7, C2 ); \
A6 = _mm_add_epi32( A6, C1 ); \
A5 = _mm_add_epi32( A5, C0 ); \
A4 = _mm_add_epi32( A4, CF ); \
A3 = _mm_add_epi32( A3, CE ); \
A2 = _mm_add_epi32( A2, CD ); \
A1 = _mm_add_epi32( A1, CC ); \
A0 = _mm_add_epi32( A0, CB ); \
AB = _mm_add_epi32( AB, CA ); \
AA = _mm_add_epi32( AA, C9 ); \
A9 = _mm_add_epi32( A9, C8 ); \
A8 = _mm_add_epi32( A8, C7 ); \
A7 = _mm_add_epi32( A7, C6 ); \
A6 = _mm_add_epi32( A6, C5 ); \
A5 = _mm_add_epi32( A5, C4 ); \
A4 = _mm_add_epi32( A4, C3 ); \
A3 = _mm_add_epi32( A3, C2 ); \
A2 = _mm_add_epi32( A2, C1 ); \
A1 = _mm_add_epi32( A1, C0 ); \
A0 = _mm_add_epi32( A0, CF ); \
AB = _mm_add_epi32( AB, CE ); \
AA = _mm_add_epi32( AA, CD ); \
A9 = _mm_add_epi32( A9, CC ); \
A8 = _mm_add_epi32( A8, CB ); \
A7 = _mm_add_epi32( A7, CA ); \
A6 = _mm_add_epi32( A6, C9 ); \
A5 = _mm_add_epi32( A5, C8 ); \
A4 = _mm_add_epi32( A4, C7 ); \
A3 = _mm_add_epi32( A3, C6 ); \
A2 = _mm_add_epi32( A2, C5 ); \
A1 = _mm_add_epi32( A1, C4 ); \
A0 = _mm_add_epi32( A0, C3 ); \
} while (0)
#define INCR_W do { \

View File

@@ -10,6 +10,7 @@
#include "algo-gate-api.h"
#include "Verthash.h"
#include "mm_malloc.h"
#include "malloc-huge.h"
//-----------------------------------------------------------------------------
// Verthash info management
@@ -84,12 +85,18 @@ int verthash_info_init(verthash_info_t* info, const char* file_name)
}
// Allocate data
info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
if (!info->data)
info->data = (uint8_t *)malloc_hugepages( fileSize );
if ( info->data )
if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages");
else
{
fclose(fileMiningData);
// Memory allocation fatal error.
return 2;
info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
if (!info->data)
{
fclose(fileMiningData);
// Memory allocation fatal error.
return 2;
}
}
// Load data

View File

@@ -29,16 +29,11 @@ void sha3_4way_keccakf( __m256i st[25] )
for ( r = 0; r < KECCAKF_ROUNDS; r++ )
{
// Theta
bc[0] = _mm256_xor_si256( st[0],
mm256_xor4( st[5], st[10], st[15], st[20] ) );
bc[1] = _mm256_xor_si256( st[1],
mm256_xor4( st[6], st[11], st[16], st[21] ) );
bc[2] = _mm256_xor_si256( st[2],
mm256_xor4( st[7], st[12], st[17], st[22] ) );
bc[3] = _mm256_xor_si256( st[3],
mm256_xor4( st[8], st[13], st[18], st[23] ) );
bc[4] = _mm256_xor_si256( st[4],
mm256_xor4( st[9], st[14], st[19], st[24] ) );
bc[0] = mm256_xor3( st[0], st[5], mm256_xor3( st[10], st[15], st[20] ) );
bc[1] = mm256_xor3( st[1], st[6], mm256_xor3( st[11], st[16], st[21] ) );
bc[2] = mm256_xor3( st[2], st[7], mm256_xor3( st[12], st[17], st[22] ) );
bc[3] = mm256_xor3( st[3], st[8], mm256_xor3( st[13], st[18], st[23] ) );
bc[4] = mm256_xor3( st[4], st[9], mm256_xor3( st[14], st[19], st[24] ) );
for ( i = 0; i < 5; i++ )
{
@@ -89,17 +84,13 @@ void sha3_4way_keccakf( __m256i st[25] )
// Chi
for ( j = 0; j < 25; j += 5 )
{
memcpy( bc, &st[ j ], 5*32 );
st[ j ] = _mm256_xor_si256( st[ j ],
_mm256_andnot_si256( bc[1], bc[2] ) );
st[ j+1 ] = _mm256_xor_si256( st[ j+1 ],
_mm256_andnot_si256( bc[2], bc[3] ) );
st[ j+2 ] = _mm256_xor_si256( st[ j+2 ],
_mm256_andnot_si256( bc[3], bc[4] ) );
st[ j+3 ] = _mm256_xor_si256( st[ j+3 ],
_mm256_andnot_si256( bc[4], bc[0] ) );
st[ j+4 ] = _mm256_xor_si256( st[ j+4 ],
_mm256_andnot_si256( bc[0], bc[1] ) );
bc[0] = st[j];
bc[1] = st[j+1];
st[ j ] = mm256_xorandnot( st[ j ], st[j+1], st[j+2] );
st[ j+1 ] = mm256_xorandnot( st[ j+1 ], st[j+2], st[j+3] );
st[ j+2 ] = mm256_xorandnot( st[ j+2 ], st[j+3], st[j+4] );
st[ j+3 ] = mm256_xorandnot( st[ j+3 ], st[j+4], bc[0] );
st[ j+4 ] = mm256_xorandnot( st[ j+4 ], bc[0], bc[1] );
}
// Iota

View File

@@ -127,7 +127,7 @@ bool register_verthash_algo( algo_gate_t* gate )
{
opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT;
const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file;

View File

@@ -60,7 +60,14 @@ void x16r_8way_prehash( void *vdata, void *pdata )
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16r_ctx.hamsi );
hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 64 );
hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
mm128_bswap32_80( edata, pdata );
fugue512_init( &x16r_ctx.fugue );
fugue512_update( &x16r_ctx.fugue, edata, 76 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
@@ -306,7 +313,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -319,14 +326,43 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
hash7, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
fugue512_full( &ctx.fugue, hash4, in4, size );
fugue512_full( &ctx.fugue, hash5, in5, size );
fugue512_full( &ctx.fugue, hash6, in6, size );
fugue512_full( &ctx.fugue, hash7, in7, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in4 + 76, 4 );
fugue512_final( &ctx.fugue, hash4 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in5 + 76, 4 );
fugue512_final( &ctx.fugue, hash5 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in6 + 76, 4 );
fugue512_final( &ctx.fugue, hash6 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in7 + 76, 4 );
fugue512_final( &ctx.fugue, hash7 );
}
else
{
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
fugue512_full( &ctx.fugue, hash4, in4, size );
fugue512_full( &ctx.fugue, hash5, in5, size );
fugue512_full( &ctx.fugue, hash6, in6, size );
fugue512_full( &ctx.fugue, hash7, in7, size );
}
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -347,25 +383,25 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
{
sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
}
@@ -532,7 +568,13 @@ void x16r_4way_prehash( void *vdata, void *pdata )
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16r_ctx.hamsi );
hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 64 );
hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
mm128_bswap32_80( edata, pdata );
fugue512_init( &x16r_ctx.fugue );
fugue512_update( &x16r_ctx.fugue, edata, 76 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case SHABAL:
mm128_bswap32_intrlv80_4x32( vdata2, pdata );
@@ -734,7 +776,7 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -745,10 +787,27 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
}
else
{
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
}
break;
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.3.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.19.1'
PACKAGE_STRING='cpuminer-opt 3.19.1'
PACKAGE_VERSION='3.19.3'
PACKAGE_STRING='cpuminer-opt 3.19.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.19.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.19.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.19.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.19.3:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.19.1
cpuminer-opt configure 3.19.3
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.19.1, which was
It was created by cpuminer-opt $as_me 3.19.3, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.19.1'
VERSION='3.19.3'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.19.1, which was
This file was extended by cpuminer-opt $as_me 3.19.3, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.19.1
cpuminer-opt config.status 3.19.3
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.19.1])
AC_INIT([cpuminer-opt], [3.19.3])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -224,7 +224,11 @@ static uint8_t thread_affinity_map[ max_cpus ];
// display affinity mask graphically
static void format_affinity_mask( char *mask_str, uint64_t mask )
{
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
int n = num_cpus / num_cpugroups;
#else
int n = num_cpus < 64 ? num_cpus : 64;
#endif
int i;
for ( i = 0; i < n; i++ )
{
@@ -2164,7 +2168,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
} // !quiet
} // new diff/block
if ( new_job && !opt_quiet )
if ( new_job && !( opt_quiet || stratum_errors ) )
{
int mismatch = submitted_share_count - ( accepted_share_count
+ stale_share_count
@@ -3609,7 +3613,9 @@ int main(int argc, char *argv[])
num_cpus = 1;
#endif
if ( num_cpus < 1 ) num_cpus = 1;
if ( num_cpus < 1 )
num_cpus = 1;
opt_n_threads = num_cpus;
parse_cmdline( argc, argv );
@@ -3745,9 +3751,6 @@ int main(int argc, char *argv[])
}
#endif
if ( ( opt_n_threads == 0 ) || ( opt_n_threads > num_cpus ) )
opt_n_threads = num_cpus;
if ( opt_affinity && num_cpus > max_cpus )
{
applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",

36
malloc-huge.c Normal file
View File

@@ -0,0 +1,36 @@
#include "malloc-huge.h"
#include "miner.h"
#define HUGEPAGE_SIZE_2M (2 * 1024 * 1024)
void *malloc_hugepages( size_t size )
{
#if !(defined(MAP_HUGETLB) && defined(MAP_ANON))
// applog( LOG_WARNING, "Huge pages not available",size);
return NULL;
#else
if ( size < HUGEPAGE_MIN_ALLOC )
{
// applog( LOG_WARNING, "Block too small for huge pages: %lu bytes",size);
return NULL;
}
const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE_2M - 1;
void *p = NULL;
int flags =
#ifdef MAP_NOCORE
MAP_NOCORE |
#endif
MAP_HUGETLB | MAP_ANON | MAP_PRIVATE;
// round size up to next page boundary
size = ( size + hugepage_mask ) & (~hugepage_mask);
p = mmap( NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0 );
if ( p == MAP_FAILED )
p = NULL;
return p;
#endif
}

24
malloc-huge.h Normal file
View File

@@ -0,0 +1,24 @@
#if !(defined(MALLOC_HUGE__))
#define MALLOC_HUGE__
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#ifdef __unix__
#include <sys/mman.h>
#endif
#if defined(MAP_HUGETLB)
// Minimum block size 6 MiB to use huge pages
#define HUGEPAGE_MIN_ALLOC (6 * 1024 * 1024)
#endif
// Attempt to allocate memory backed by 2 MiB pages, returns NULL on failure.
void *malloc_hugepages( size_t size );
#endif

View File

@@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
./clean-all.sh || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS
CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
@@ -53,7 +53,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
# AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS
CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx512.exe
@@ -61,7 +61,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
# AVX2 SHA VAES: Intel Alderlake, AMD Zen3
make clean || echo done
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -mavx2 -msha -mvaes" ./configure $CONFIGURE_ARGS
CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
@@ -69,7 +69,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
# AVX2 AES SHA: AMD Zen1
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS
CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2-sha.exe
@@ -77,7 +77,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe
# AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -march=core-avx2" ./configure $CONFIGURE_ARGS
CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2.exe
@@ -85,7 +85,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe
# AVX AES: Intel Sandybridge, Ivybridge
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS_OLD -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS
CFLAGS="-march=corei7-avx -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx.exe
@@ -93,7 +93,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
# SSE4.2 AES: Intel Westmere
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS_OLD -march=westmere -maes" ./configure $CONFIGURE_ARGS
CFLAGS="-march=westmere -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-sse42.exe
@@ -118,9 +118,16 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
# Generic SSE2
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS_OLD -msse2" ./configure $CONFIGURE_ARGS
CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean
# Native with CPU groups ennabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe