Jay D Dee
2023-09-28 18:43:18 -04:00
parent be88afc349
commit bc5a5c6df8
88 changed files with 5526 additions and 3361 deletions

View File

@@ -6,13 +6,12 @@
#include "sha256-hash.h"
#include "compat.h"
/*
static const uint32_t H256[8] =
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
*/
static const uint32_t K256[64] =
{
@@ -83,7 +82,7 @@ static const uint32_t K256[64] =
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i T1, T2; \
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
__m128i K = v128_32( K256[( (j)+(i) )] ); \
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
K, W[i] ) ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
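The bulk of this commit is a mechanical substitution: every raw _mm_set1_epi32 / _mm256_set1_epi32 / _mm512_set1_epi32 broadcast becomes a width-suffixed vNNN_32 (or vNNN_64) wrapper. A minimal sketch of what these wrappers are assumed to expand to, inferred from the one-for-one replacements in the diff (the real definitions live in the project's simd-utils headers):
#include <immintrin.h>
#include <stdint.h>
// Illustrative only -- inferred from the diff, not copied from simd-utils.
// Each broadcasts one scalar across every lane of the vector.
#define v128_32(x)  _mm_set1_epi32( (int)(x) )
#define v128_64(x)  _mm_set1_epi64x( (int64_t)(x) )
#define v256_32(x)  _mm256_set1_epi32( (int)(x) )
#define v256_64(x)  _mm256_set1_epi64x( (int64_t)(x) )
#define v512_32(x)  _mm512_set1_epi32( (int)(x) )
#define v512_64(x)  _mm512_set1_epi64( (int64_t)(x) )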
@@ -358,19 +357,19 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 );
T0 = _mm_add_epi32( _mm_set1_epi32( K256[58] ),
T0 = _mm_add_epi32( v128_32( K256[58] ),
mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = _mm_add_epi32( B, T0 );
T1 = _mm_add_epi32( _mm_set1_epi32( K256[59] ),
T1 = _mm_add_epi32( v128_32( K256[59] ),
mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = _mm_add_epi32( A, T1 );
T2 = _mm_add_epi32( _mm_set1_epi32( K256[60] ),
T2 = _mm_add_epi32( v128_32( K256[60] ),
mm128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = _mm_add_epi32( H, T2 );
targ = _mm_set1_epi32( target[7] );
targ = v128_32( target[7] );
hash = mm128_bswap_32( _mm_add_epi32( H, IV7 ) );
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash );
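SSE2 has no unsigned 32-bit compare, and mm128_movmask_32 collects sign bits, so the flip term converts the signed _mm_cmpgt_epi32 result into an unsigned comparison: XORing a per-lane "sign bits differ" mask with the signed greater-than mask yields the unsigned greater-than mask. A scalar model of one lane (a sketch, not from the source):
#include <stdint.h>
// sign_diff ^ signed_gt == unsigned_gt: when the sign bits match, signed
// and unsigned compares agree; when they differ, the operand with the top
// bit set is the larger unsigned value, exactly the case where the signed
// compare is wrong.
static inline int u32_gt_via_signed( uint32_t a, uint32_t b )
{
    int sign_diff = ( (int32_t)a < 0 ) ^ ( (int32_t)b < 0 );
    return sign_diff ^ ( (int32_t)a > (int32_t)b );
}
In the vector code flip precomputes sign(target[7]) ^ sign(hash) for all four lanes at once, so 0xf == ( flip ^ movmask( cmpgt ) ) means every lane's hash word 7 is strictly above the target and the whole nonce batch can be rejected.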
@@ -386,13 +385,13 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
// round 61 part 1
W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm_add_epi32( _mm_set1_epi32( K256[61] ),
T0 = _mm_add_epi32( v128_32( K256[61] ),
mm128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = _mm_add_epi32( G, T0 );
if ( t6_mask )
{
targ = _mm_and_si128( vmask, _mm_set1_epi32( target[6] ) );
targ = _mm_and_si128( vmask, v128_32( target[6] ) );
hash = mm128_bswap_32( _mm_add_epi32( G, IV6 ) );
if ( ( 0 != ( t6_mask & mm128_movmask_32(
@@ -440,14 +439,14 @@ return 1;
void sha256_4way_init( sha256_4way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
sc->val[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
sc->val[4] = _mm_set1_epi64x( 0x510E527F510E527F );
sc->val[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
sc->val[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
sc->val[0] = v128_32( sha256_iv[0] );
sc->val[1] = v128_32( sha256_iv[1] );
sc->val[2] = v128_32( sha256_iv[2] );
sc->val[3] = v128_32( sha256_iv[3] );
sc->val[4] = v128_32( sha256_iv[4] );
sc->val[5] = v128_32( sha256_iv[5] );
sc->val[6] = v128_32( sha256_iv[6] );
sc->val[7] = v128_32( sha256_iv[7] );
}
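The init rewrite is equivalence-preserving: broadcasting the 32-bit IV word produces the same byte pattern as the old doubled 64-bit constant. A self-contained check, assuming v128_32 wraps _mm_set1_epi32:
#include <immintrin.h>
#include <string.h>
#include <assert.h>
int main(void)
{
    __m128i a = _mm_set1_epi64x( 0x6A09E6676A09E667 );  // old style
    __m128i b = _mm_set1_epi32( 0x6A09E667 );           // new style
    assert( 0 == memcmp( &a, &b, sizeof a ) );          // identical lanes
    return 0;
}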
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
@@ -490,7 +489,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -506,8 +505,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = _mm_set1_epi32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = _mm_set1_epi32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = v128_32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
@@ -580,7 +579,7 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \
W[ i ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
@@ -614,7 +613,7 @@ do { \
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
@@ -634,7 +633,7 @@ do { \
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
@@ -643,7 +642,7 @@ do { \
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
__m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
@@ -666,7 +665,7 @@ do { \
#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i0) ] ), \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
W[ i0 ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
@@ -677,7 +676,7 @@ do { \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
\
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i1) ] ), \
T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
W[ (i1) ] ); \
T1 = BSG2_1x( D ); \
T2 = BSG2_0x( H ); \
@@ -790,7 +789,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
// round 3 part 1, avoid nonces W[3]
T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D),
_mm256_set1_epi32( K256[3] ) );
v256_32( K256[3] ) );
A = _mm256_add_epi32( A, T1 );
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x(F),
MAJx(F, G, H) ) );
@@ -910,12 +909,11 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
int flip;
int t6_mask;
__m256i vmask, targ, hash;
__m256i W[16]; memcpy_256( W, data, 16 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
uint8_t flip, t6_mask;
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in+1 );
@@ -991,26 +989,28 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// round 58 to 60 part 1
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[58] ),
T0 = _mm256_add_epi32( v256_32( K256[58] ),
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
B = _mm256_add_epi32( B, T0 );
T1 = _mm256_add_epi32( _mm256_set1_epi32( K256[59] ),
T1 = _mm256_add_epi32( v256_32( K256[59] ),
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
A = _mm256_add_epi32( A, T1 );
T2 = _mm256_add_epi32( _mm256_set1_epi32( K256[60] ),
T2 = _mm256_add_epi32( v256_32( K256[60] ),
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
H = _mm256_add_epi32( H, T2 );
// Got H, test it.
targ = _mm256_set1_epi32( target[7] );
targ = v256_32( target[7] );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
if ( target[7] )
{
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
return 0;
return 0;
}
t6_mask = mm256_movmask_32( vmask = _mm256_cmpeq_epi32( hash, targ ) );
// round 58 part 2
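The *_transform_le_short variants cut the final rounds short once the hash is provably above or below the target. The per-lane decision ladder, modeled in scalar form (hypothetical helper name, not in the source):
#include <stdint.h>
// hash7/hash6: the two most significant words of the byte-swapped hash;
// targ7/targ6: the corresponding target words.
static inline int lane_may_be_valid( uint32_t hash7, uint32_t hash6,
                                     uint32_t targ7, uint32_t targ6 )
{
    if ( hash7 > targ7 ) return 0;   // definitely above target: reject
    if ( hash7 < targ7 ) return 1;   // definitely below: finish the hash
    return hash6 <= targ6;           // tie on word 7: word 6 decides
}
t6_mask marks the tied lanes; only when it is non-zero does the code go on to compute G and test word 6.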
@@ -1018,14 +1018,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
MAJx( G, H, A ) ) );
// round 61 part 1
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[61] ),
T0 = _mm256_add_epi32( v256_32( K256[61] ),
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
G = _mm256_add_epi32( G, T0 );
if ( t6_mask )
{
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) );
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
@@ -1078,14 +1078,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
void sha256_8way_init( sha256_8way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
sc->val[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
sc->val[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
sc->val[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
sc->val[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sc->val[0] = v256_32( sha256_iv[0] );
sc->val[1] = v256_32( sha256_iv[1] );
sc->val[2] = v256_32( sha256_iv[2] );
sc->val[3] = v256_32( sha256_iv[3] );
sc->val[4] = v256_32( sha256_iv[4] );
sc->val[5] = v256_32( sha256_iv[5] );
sc->val[6] = v256_32( sha256_iv[6] );
sc->val[7] = v256_32( sha256_iv[7] );
}
// need to handle odd byte length for yespower.
@@ -1131,7 +1131,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = v256_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -1147,8 +1147,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = _mm256_set1_epi32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = _mm256_set1_epi32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
@@ -1210,7 +1210,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[(j)+(i)] ), W[i] ); \
__m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
__m512i T1 = BSG2_1x16( E ); \
__m512i T2 = BSG2_0x16( A ); \
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
@@ -1224,7 +1224,7 @@ do { \
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
_mm512_set1_epi32( K256[(i)+(j)] ) ); \
v512_32( K256[(i)+(j)] ) ); \
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
@@ -1234,7 +1234,7 @@ do { \
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
do { \
__m512i T1, T2; \
__m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \
__m512i K = v512_32( K256[( (j)+(i) )] ); \
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
@@ -1345,7 +1345,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
// round 3 part 1, avoid nonces W[3]
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[3] ) );
v512_32( K256[3] ) );
A = _mm512_add_epi32( A, T1 );
E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16(F),
MAJx16(F, G, H) ) );
@@ -1566,21 +1566,22 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// rounds 58 to 60 part 1
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[58] ),
T0 = _mm512_add_epi32( v512_32( K256[58] ),
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
B = _mm512_add_epi32( B, T0 );
T1 = _mm512_add_epi32( _mm512_set1_epi32( K256[59] ),
T1 = _mm512_add_epi32( v512_32( K256[59] ),
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
A = _mm512_add_epi32( A, T1 );
T2 = _mm512_add_epi32( _mm512_set1_epi32( K256[60] ),
T2 = _mm512_add_epi32( v512_32( K256[60] ),
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
H = _mm512_add_epi32( H, T2 );
// got H, test it against target[7]
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
targ = _mm512_set1_epi32( target[7] );
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
targ = v512_32( target[7] );
if ( target[7] )
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
return 0;
t6_mask = _mm512_cmpeq_epi32_mask( hash, targ );
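On AVX-512 the sign-flip trick disappears: _mm512_cmple_epu32_mask is a native unsigned compare that returns a 16-bit k-mask, so the rejection test is a plain integer check. The same test in isolation:
__mmask16 le = _mm512_cmple_epu32_mask( hash, targ );  // 1 bit per lane
if ( le == 0 ) return 0;   // no lane's hash word 7 is <= the target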
@@ -1591,15 +1592,15 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// round 61 part 1
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[61] ),
T0 = _mm512_add_epi32( v512_32( K256[61] ),
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
G = _mm512_add_epi32( G, T0 );
// got G, test it against target[6] if indicated
if ( t6_mask != 0 )
if ( (uint16_t)t6_mask )
{
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
targ = _mm512_set1_epi32( target[6] );
targ = v512_32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
return 0;
}
@@ -1637,14 +1638,14 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
void sha256_16way_init( sha256_16way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
sc->val[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
sc->val[4] = _mm512_set1_epi64( 0x510E527F510E527F );
sc->val[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
sc->val[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sc->val[0] = v512_32( sha256_iv[0] );
sc->val[1] = v512_32( sha256_iv[1] );
sc->val[2] = v512_32( sha256_iv[2] );
sc->val[3] = v512_32( sha256_iv[3] );
sc->val[4] = v512_32( sha256_iv[4] );
sc->val[5] = v512_32( sha256_iv[5] );
sc->val[6] = v512_32( sha256_iv[6] );
sc->val[7] = v512_32( sha256_iv[7] );
}
void sha256_16way_update( sha256_16way_context *sc, const void *data,
@@ -1688,7 +1689,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = v512_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -1704,8 +1705,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = _mm512_set1_epi32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = _mm512_set1_epi32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );

File diff suppressed because it is too large

View File

@@ -118,10 +118,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i last_byte = v512_32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
@@ -130,42 +130,42 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
sha256_transform_le( phash, pdata, sha256_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
mstate1[0] = v512_32( phash[0] );
mstate1[1] = v512_32( phash[1] );
mstate1[2] = v512_32( phash[2] );
mstate1[3] = v512_32( phash[3] );
mstate1[4] = v512_32( phash[4] );
mstate1[5] = v512_32( phash[5] );
mstate1[6] = v512_32( phash[6] );
mstate1[7] = v512_32( phash[7] );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[0] = v512_32( pdata[16] );
buf[1] = v512_32( pdata[17] );
buf[2] = v512_32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
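// The 3-round prehash is sound because SHA-256 round i consumes message
// word W[i] and the nonces occupy only W[3] (buf[3] above); W[0..2] come
// from pdata[16..18], constant across the whole batch, so rounds 0-2 (and
// part of the message expansion) are hashed once here instead of once per
// loop iteration.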
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
istate[0] = v512_32( sha256_iv[0] );
istate[1] = v512_32( sha256_iv[1] );
istate[2] = v512_32( sha256_iv[2] );
istate[3] = v512_32( sha256_iv[3] );
istate[4] = v512_32( sha256_iv[4] );
istate[5] = v512_32( sha256_iv[5] );
istate[6] = v512_32( sha256_iv[6] );
istate[7] = v512_32( sha256_iv[7] );
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
block[15] = v512_32( 32*8 ); // bit count
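// Shape of the sha256d scan loop below (annotation, not source): the
// 80-byte header splits into two 64-byte SHA-256 blocks.
//   phash = sha256_block( header[0..63] )                -- once, scalar
//   per batch of 16 nonces:
//     h1 = sha256_block( header[64..79] | nonce | pad )   with iv = phash
//     h2 = sha256_block( h1 | pad )                       with iv = sha256_iv
//     test h2 against the target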
do
{
@@ -216,33 +216,33 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
vdata[i] = v256_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 80*8 );
vdata[16+15] = v256_32( 80*8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 32*8 );
block[15] = v256_32( 32*8 );
// initialize state for second hash
istate[0] = _mm256_set1_epi32( sha256_iv[0] );
istate[1] = _mm256_set1_epi32( sha256_iv[1] );
istate[2] = _mm256_set1_epi32( sha256_iv[2] );
istate[3] = _mm256_set1_epi32( sha256_iv[3] );
istate[4] = _mm256_set1_epi32( sha256_iv[4] );
istate[5] = _mm256_set1_epi32( sha256_iv[5] );
istate[6] = _mm256_set1_epi32( sha256_iv[6] );
istate[7] = _mm256_set1_epi32( sha256_iv[7] );
istate[0] = v256_32( sha256_iv[0] );
istate[1] = v256_32( sha256_iv[1] );
istate[2] = v256_32( sha256_iv[2] );
istate[3] = v256_32( sha256_iv[3] );
istate[4] = v256_32( sha256_iv[4] );
istate[5] = v256_32( sha256_iv[5] );
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
@@ -298,31 +298,31 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm_set1_epi32( pdata[i] );
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 80*8 );
vdata[16+15] = v128_32( 80*8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 32*8 );
block[15] = v128_32( 32*8 );
// initialize state
istate[0] = _mm_set1_epi32( sha256_iv[0] );
istate[1] = _mm_set1_epi32( sha256_iv[1] );
istate[2] = _mm_set1_epi32( sha256_iv[2] );
istate[3] = _mm_set1_epi32( sha256_iv[3] );
istate[4] = _mm_set1_epi32( sha256_iv[4] );
istate[5] = _mm_set1_epi32( sha256_iv[5] );
istate[6] = _mm_set1_epi32( sha256_iv[6] );
istate[7] = _mm_set1_epi32( sha256_iv[7] );
istate[0] = v128_32( sha256_iv[0] );
istate[1] = v128_32( sha256_iv[1] );
istate[2] = v128_32( sha256_iv[2] );
istate[3] = v128_32( sha256_iv[3] );
istate[4] = v128_32( sha256_iv[4] );
istate[5] = v128_32( sha256_iv[5] );
istate[6] = v128_32( sha256_iv[6] );
istate[7] = v128_32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( mstate, vdata, istate );

View File

@@ -51,8 +51,7 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
// fill & pad second bock without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 3] = block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
@@ -128,10 +127,10 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i last_byte = v512_32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
@@ -140,42 +139,42 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
sha256_transform_le( phash, pdata, sha256dt_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
mstate1[0] = v512_32( phash[0] );
mstate1[1] = v512_32( phash[1] );
mstate1[2] = v512_32( phash[2] );
mstate1[3] = v512_32( phash[3] );
mstate1[4] = v512_32( phash[4] );
mstate1[5] = v512_32( phash[5] );
mstate1[6] = v512_32( phash[6] );
mstate1[7] = v512_32( phash[7] );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[0] = v512_32( pdata[16] );
buf[1] = v512_32( pdata[17] );
buf[2] = v512_32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 0x480 ); // sha256dt funky bit count
buf[15] = v512_32( 0x480 ); // sha256dt funky bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
istate[3] = _mm512_set1_epi32( sha256dt_iv[3] );
istate[4] = _mm512_set1_epi32( sha256dt_iv[4] );
istate[5] = _mm512_set1_epi32( sha256dt_iv[5] );
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
istate[0] = v512_32( sha256dt_iv[0] );
istate[1] = v512_32( sha256dt_iv[1] );
istate[2] = v512_32( sha256dt_iv[2] );
istate[3] = v512_32( sha256dt_iv[3] );
istate[4] = v512_32( sha256dt_iv[4] );
istate[5] = v512_32( sha256dt_iv[5] );
istate[6] = v512_32( sha256dt_iv[6] );
istate[7] = v512_32( sha256dt_iv[7] );
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
block[15] = v512_32( 0x300 ); // bit count
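// Decoding the "funky" sha256dt length fields (plain arithmetic):
//   0x480 = 1152 bits = 144 bytes, where standard padding of the 80-byte
//           header would encode 80*8 = 640 = 0x280;
//   0x300 =  768 bits =  96 bytes, versus 32*8 = 256 = 0x100 for a
//           standard 32-byte second hash.
// Both counts are inflated by exactly 0x200 = 512 bits = one 64-byte block.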
do
{
@@ -226,33 +225,33 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
vdata[i] = v256_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 0x480 );
vdata[16+15] = v256_32( 0x480 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 0x300 );
block[15] = v256_32( 0x300 );
// initialize state for swecond hash
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
istate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
istate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
istate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
istate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
istate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
// initialize state for second hash
istate[0] = v256_32( sha256dt_iv[0] );
istate[1] = v256_32( sha256dt_iv[1] );
istate[2] = v256_32( sha256dt_iv[2] );
istate[3] = v256_32( sha256dt_iv[3] );
istate[4] = v256_32( sha256dt_iv[4] );
istate[5] = v256_32( sha256dt_iv[5] );
istate[6] = v256_32( sha256dt_iv[6] );
istate[7] = v256_32( sha256dt_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
@@ -308,31 +307,31 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm_set1_epi32( pdata[i] );
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 0x480 );
vdata[16+15] = v128_32( 0x480 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 0x300 );
block[15] = v128_32( 0x300 );
// initialize state
initstate[0] = _mm_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
initstate[1] = _mm_set1_epi64x( 0xb72074d4b72074d4 );
initstate[2] = _mm_set1_epi64x( 0x6bb011226bb01122 );
initstate[3] = _mm_set1_epi64x( 0xd338e869d338e869 );
initstate[4] = _mm_set1_epi64x( 0xaa3ff126aa3ff126 );
initstate[5] = _mm_set1_epi64x( 0x475bbf30475bbf30 );
initstate[6] = _mm_set1_epi64x( 0x8fd52e5b8fd52e5b );
initstate[7] = _mm_set1_epi64x( 0x9f75c9ad9f75c9ad );
initstate[0] = v128_32( sha256dt_iv[0] );
initstate[1] = v128_32( sha256dt_iv[1] );
initstate[2] = v128_32( sha256dt_iv[2] );
initstate[3] = v128_32( sha256dt_iv[3] );
initstate[4] = v128_32( sha256dt_iv[4] );
initstate[5] = v128_32( sha256dt_iv[5] );
initstate[6] = v128_32( sha256dt_iv[6] );
initstate[7] = v128_32( sha256dt_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );
@@ -342,21 +341,18 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( hash32, block, initstate );
// if ( sha256_4way_transform_le_short( hash32, block, initstate, ptarget ) )
// {
mm128_block_bswap_32( hash32, hash32 );
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
// }
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );

View File

@@ -30,10 +30,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i last_byte = v512_32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
@@ -42,42 +42,42 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
sha256_transform_le( phash, pdata, sha256_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
mstate1[0] = v512_32( phash[0] );
mstate1[1] = v512_32( phash[1] );
mstate1[2] = v512_32( phash[2] );
mstate1[3] = v512_32( phash[3] );
mstate1[4] = v512_32( phash[4] );
mstate1[5] = v512_32( phash[5] );
mstate1[6] = v512_32( phash[6] );
mstate1[7] = v512_32( phash[7] );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[0] = v512_32( pdata[16] );
buf[1] = v512_32( pdata[17] );
buf[2] = v512_32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
istate[0] = v512_32( sha256_iv[0] );
istate[1] = v512_32( sha256_iv[1] );
istate[2] = v512_32( sha256_iv[2] );
istate[3] = v512_32( sha256_iv[3] );
istate[4] = v512_32( sha256_iv[4] );
istate[5] = v512_32( sha256_iv[5] );
istate[6] = v512_32( sha256_iv[6] );
istate[7] = v512_32( sha256_iv[7] );
// initialize padding for 2nd & 3rd sha256
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
block[15] = v512_32( 32*8 ); // bit count
do
{
@@ -222,33 +222,33 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
vdata[i] = v256_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
vdata[16+15] = v256_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
block[15] = v256_32( 32*8 ); // bit count
// initialize state
istate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
istate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
istate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
istate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
istate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
istate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
istate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
istate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
istate[0] = v256_32( sha256_iv[0] );
istate[1] = v256_32( sha256_iv[1] );
istate[2] = v256_32( sha256_iv[2] );
istate[3] = v256_32( sha256_iv[3] );
istate[4] = v256_32( sha256_iv[4] );
istate[5] = v256_32( sha256_iv[5] );
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
@@ -313,31 +313,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm_set1_epi32( pdata[i] );
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
vdata[16+15] = v128_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 32*8 ); // bit count
block[15] = v128_32( 32*8 ); // bit count
// initialize state
istate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
istate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
istate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
istate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
istate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
istate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
istate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
istate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
istate[0] = v128_32( sha256_iv[0] );
istate[1] = v128_32( sha256_iv[1] );
istate[2] = v128_32( sha256_iv[2] );
istate[3] = v128_32( sha256_iv[3] );
istate[4] = v128_32( sha256_iv[4] );
istate[5] = v128_32( sha256_iv[5] );
istate[6] = v128_32( sha256_iv[6] );
istate[7] = v128_32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( mstate, vdata, istate );

View File

@@ -39,57 +39,429 @@
/*
static const uint64_t H512[8] =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
*/
static const uint64_t K512[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD,
0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019,
0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE,
0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210,
0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926,
0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001,
0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910,
0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207,
0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493,
0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
#if defined(__AVX2__) && defined(__SHA512__)
// SHA-512 implemented using SHA512 CPU extension.
// Experimental. Not tested. Not reviewed. Compile tested only.
// Needs GCC-13 for compilation.
// Needs Intel Lunar Lake or Arrow Lake CPU, or AMD Zen-{5,6}? for execution.
// Modelled after noloader sha256 implementation.
// It's not clear how SHA512 will be supported before AVX10, considering how
// dependent it is on _mm256_alignr_epi64, which is only available with
// AVX512VL until AVX10-256.
#if defined(__AVX512VL__)
#define mm256_alignr_1x64( v1, v0 ) _mm256_alignr_epi64( v1, v0, 1 )
#else
// Ugly workaround to make it work with AVX2
static const __m256i mask __attribute__ ((aligned (32)))
= { 0xffffffffffffffffull, 0ull, 0ull, 0ull };
#define mm256_alignr_1x64( v1, v0 ) \
_mm256_or_si256( _mm256_and_si256( mm256_shuflr_64( v1 ), mask ), \
_mm256_and_si256( mm256_shuflr_64( v0 ), mm256_not(mask) ) )
#endif
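For reference, the semantics both branches of the macro must reproduce: _mm256_alignr_epi64( v1, v0, 1 ) (VALIGNQ, imm = 1) shifts the 512-bit concatenation v1:v0 right by one 64-bit element and keeps the low 256 bits. A scalar reference the AVX2 fallback could be tested against (a sketch):
#include <stdint.h>
// dst.q[0..3] = { v0.q[1], v0.q[2], v0.q[3], v1.q[0] }
static inline void alignr_1x64_ref( uint64_t d[4], const uint64_t v1[4],
                                    const uint64_t v0[4] )
{
    d[0] = v0[1]; d[1] = v0[2]; d[2] = v0[3]; d[3] = v1[0];
}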
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP, BSWAP64;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
0x0001020304050607 ) );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
STATE1 = _mm256_blend_epi32( STATE1, TMP, 0xF0 ); // CDGH
// Save initial state
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+32) );
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+64) );
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+96) );
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
STATE1 = _mm256_add_epi64( STATE1, CDGH_SAVE );
TMP = _mm256_permute4x64_epi64( STATE0, 0x1B ); // FEBA
STATE1 = _mm256_permute4x64_epi64( STATE1, 0xB1 ); // DCHG
STATE0 = _mm256_blend_epi32( TMP, STATE1, 0xF0 ); // DCBA
STATE1 = _mm256_permute2x128_si256( STATE1, TMP, 0x21 ); // ABEF
// Save state
_mm256_store_si256((__m256i*) &state_out[0], STATE0 );
_mm256_store_si256((__m256i*) &state_out[4], STATE1 );
}
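A hypothetical usage sketch (not part of the commit): hashing one 128-byte big-endian block starting from the standard SHA-512 IV shown in the H512 comment at the top of the file; block is assumed to point at 128 bytes of input.
uint64_t state[8] __attribute__ ((aligned (32))) =
{
    0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
    0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
sha512_opt_transform_be( state, block, state );   // one 128-byte block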
void sha512_opt_transform_le( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
STATE1 = _mm256_blend_epi32( STATE1, TMP, 0xF0 ); // CDGH
// Save initial state
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+32) );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+64) );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+96) );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
STATE1 = _mm256_add_epi64( STATE1, CDGH_SAVE );
TMP = _mm256_permute4x64_epi64( STATE0, 0x1B ); // FEBA
STATE1 = _mm256_permute4x64_epi64( STATE1, 0xB1 ); // DCHG
STATE0 = _mm256_blend_epi32( TMP, STATE1, 0xF0 ); // DCBA
STATE1 = _mm256_permute2x128_si256( STATE1, TMP, 0x21 ); // ABEF
// Save state
_mm256_store_si256((__m256i*) &state_out[0], STATE0 );
_mm256_store_si256((__m256i*) &state_out[4], STATE1 );
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -117,7 +489,7 @@ static const uint64_t K512[80] =
#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \
__m512i T0 = _mm512_add_epi64( v512_64( K512[i] ), W[ i ] ); \
__m512i T1 = BSG8W_5_1( E ); \
__m512i T2 = BSG8W_5_0( A ); \
T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \
@@ -155,14 +527,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
}
else
{
A = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
B = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
C = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
D = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
E = _mm512_set1_epi64( 0x510E527FADE682D1 );
F = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
G = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
H = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
A = v512_64( 0x6A09E667F3BCC908 );
B = v512_64( 0xBB67AE8584CAA73B );
C = v512_64( 0x3C6EF372FE94F82B );
D = v512_64( 0xA54FF53A5F1D36F1 );
E = v512_64( 0x510E527FADE682D1 );
F = v512_64( 0x9B05688C2B3E6C1F );
G = v512_64( 0x1F83D9ABFB41BD6B );
H = v512_64( 0x5BE0CD19137E2179 );
}
for ( i = 0; i < 80; i += 8 )
@@ -191,14 +563,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
else
{
ctx->initialized = true;
r[0] = _mm512_add_epi64( A, _mm512_set1_epi64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, _mm512_set1_epi64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, _mm512_set1_epi64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, _mm512_set1_epi64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, _mm512_set1_epi64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, _mm512_set1_epi64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, _mm512_set1_epi64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, _mm512_set1_epi64( 0x5BE0CD19137E2179 ) );
r[0] = _mm512_add_epi64( A, v512_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, v512_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, v512_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, v512_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, v512_64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, v512_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, v512_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, v512_64( 0x5BE0CD19137E2179 ) );
}
}
@@ -243,7 +615,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
sc->buf[ ptr>>3 ] = v512_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
@@ -255,9 +627,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
v512_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 );
v512_64( sc->count << 3 ), shuff_bswap64 );
sha512_8way_round( sc, sc->buf, sc->val );
mm512_block_bswap_64( dst, sc->val );
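The two words stored just above encode SHA-512's 128-bit big-endian bit length: sc->count holds bytes, so the low word is count << 3 and the high word is count >> 61. A scalar sketch of the same split (helper name illustrative):

#include <stdint.h>

// Split a byte count into the 128-bit bit length SHA-512 appends.
static inline void sha512_bitlen( uint64_t count_bytes,
                                  uint64_t *hi, uint64_t *lo )
{
   *hi = count_bytes >> 61;   // upper 64 bits of 8 * count_bytes
   *lo = count_bytes << 3;    // lower 64 bits of 8 * count_bytes
}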
@@ -295,7 +667,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
@@ -317,7 +689,7 @@ do { \
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
@@ -364,14 +736,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
}
else
{
A = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
B = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
C = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
D = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
E = _mm256_set1_epi64x( 0x510E527FADE682D1 );
F = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
G = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
H = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
A = v256_64( 0x6A09E667F3BCC908 );
B = v256_64( 0xBB67AE8584CAA73B );
C = v256_64( 0x3C6EF372FE94F82B );
D = v256_64( 0xA54FF53A5F1D36F1 );
E = v256_64( 0x510E527FADE682D1 );
F = v256_64( 0x9B05688C2B3E6C1F );
G = v256_64( 0x1F83D9ABFB41BD6B );
H = v256_64( 0x5BE0CD19137E2179 );
}
#if !defined(__AVX512VL__)
@@ -405,14 +777,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
else
{
ctx->initialized = true;
r[0] = _mm256_add_epi64( A, _mm256_set1_epi64x( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, _mm256_set1_epi64x( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, _mm256_set1_epi64x( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, _mm256_set1_epi64x( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, _mm256_set1_epi64x( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, _mm256_set1_epi64x( 0x5BE0CD19137E2179 ) );
r[0] = _mm256_add_epi64( A, v256_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, v256_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, v256_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, v256_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, v256_64( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, v256_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, v256_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, v256_64( 0x5BE0CD19137E2179 ) );
}
}
@@ -457,7 +829,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
sc->buf[ ptr>>3 ] = v256_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
@@ -469,9 +841,9 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
_mm256_set1_epi64x( sc->count >> 61 ), shuff_bswap64 );
v256_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
_mm256_set1_epi64x( sc->count << 3 ), shuff_bswap64 );
v256_64( sc->count << 3 ), shuff_bswap64 );
sha512_4way_round( sc, sc->buf, sc->val );
mm256_block_bswap_64( dst, sc->val );

View File

@@ -5,11 +5,32 @@
#include "simd-utils.h"
#include "sph_sha2.h"
#if defined(__SHA512__) && defined(__AVX2__)
// Experimental, untested.
// Intended as a drop-in replacement for sph_sha512.
typedef struct
{
uint64_t buf[128>>3];
uint64_t val[8];
uint64_t count;
} sha512_context __attribute__ ((aligned (64)));
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in );
void sha512_opt_transform_le( uint64_t *state_out, const void *input,
const uint64_t *state_in );
#endif
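A minimal usage sketch for the transform declared above, assuming the caller supplies one fully padded 128-byte block; sha512_iv is the standard SHA-512 initial state, and the function and buffer names are illustrative:

#include <stdint.h>

static const uint64_t sha512_iv[8] =
{
   0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
   0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
   0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
   0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};

void example_one_block( const uint8_t block[128] )
{
   uint64_t state[8];
   // state receives the chaining value after one compression.
   sha512_opt_transform_be( state, block, sha512_iv );
}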
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
typedef struct
{
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
@@ -28,7 +49,8 @@ void sha512_8way_full( void *dst, const void *data, size_t len );
// SHA-512 4 way
typedef struct {
typedef struct
{
__m256i buf[128>>3];
__m256i val[8];
uint64_t count;

View File

@@ -16,14 +16,14 @@ static void sha512256d_8way_init( sha512_8way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = _mm512_set1_epi64( 0x22312194FC2BF72C );
ctx->val[1] = _mm512_set1_epi64( 0x9F555FA3C84C64C2 );
ctx->val[2] = _mm512_set1_epi64( 0x2393B86B6F53B151 );
ctx->val[3] = _mm512_set1_epi64( 0x963877195940EABD );
ctx->val[4] = _mm512_set1_epi64( 0x96283EE2A88EFFE3 );
ctx->val[5] = _mm512_set1_epi64( 0xBE5E1E2553863992 );
ctx->val[6] = _mm512_set1_epi64( 0x2B0199FC2C85B8AA );
ctx->val[7] = _mm512_set1_epi64( 0x0EB72DDC81C52CA2 );
ctx->val[0] = v512_64( 0x22312194FC2BF72C );
ctx->val[1] = v512_64( 0x9F555FA3C84C64C2 );
ctx->val[2] = v512_64( 0x2393B86B6F53B151 );
ctx->val[3] = v512_64( 0x963877195940EABD );
ctx->val[4] = v512_64( 0x96283EE2A88EFFE3 );
ctx->val[5] = v512_64( 0xBE5E1E2553863992 );
ctx->val[6] = v512_64( 0x2B0199FC2C85B8AA );
ctx->val[7] = v512_64( 0x0EB72DDC81C52CA2 );
}
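The eight constants above are the SHA-512/256 initial values from FIPS 180-4, and the trailing d denotes a double hash. A scalar sketch of what each vector lane computes, with sha512_256() standing in for any SHA-512/256 implementation:

#include <stddef.h>
#include <stdint.h>

// Hypothetical primitive: any conforming SHA-512/256 implementation.
void sha512_256( uint8_t out[32], const void *in, size_t len );

void sha512256d( uint8_t out[32], const void *in, size_t len )
{
   uint8_t mid[32];
   sha512_256( mid, in, len );   // first pass
   sha512_256( out, mid, 32 );   // second pass over the 32-byte digest
}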
int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
@@ -43,7 +43,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i eight = _mm512_set1_epi64( 0x0000000800000000 );
const __m512i eight = v512_64( 0x0000000800000000 );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
@@ -84,14 +84,14 @@ static void sha512256d_4way_init( sha512_4way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = _mm256_set1_epi64x( 0x22312194FC2BF72C );
ctx->val[1] = _mm256_set1_epi64x( 0x9F555FA3C84C64C2 );
ctx->val[2] = _mm256_set1_epi64x( 0x2393B86B6F53B151 );
ctx->val[3] = _mm256_set1_epi64x( 0x963877195940EABD );
ctx->val[4] = _mm256_set1_epi64x( 0x96283EE2A88EFFE3 );
ctx->val[5] = _mm256_set1_epi64x( 0xBE5E1E2553863992 );
ctx->val[6] = _mm256_set1_epi64x( 0x2B0199FC2C85B8AA );
ctx->val[7] = _mm256_set1_epi64x( 0x0EB72DDC81C52CA2 );
ctx->val[0] = v256_64( 0x22312194FC2BF72C );
ctx->val[1] = v256_64( 0x9F555FA3C84C64C2 );
ctx->val[2] = v256_64( 0x2393B86B6F53B151 );
ctx->val[3] = v256_64( 0x963877195940EABD );
ctx->val[4] = v256_64( 0x96283EE2A88EFFE3 );
ctx->val[5] = v256_64( 0xBE5E1E2553863992 );
ctx->val[6] = v256_64( 0x2B0199FC2C85B8AA );
ctx->val[7] = v256_64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
@@ -111,7 +111,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = _mm256_set1_epi64x( 0x0000000400000000 );
const __m256i four = v256_64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(