This commit is contained in:
Jay D Dee
2024-07-01 00:33:19 -04:00
parent c47c4a8885
commit 47e24b50e8
23 changed files with 2097 additions and 2855 deletions

View File

@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
\
/* AddRoundConstant P1024 */\
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
\
/* AddRoundConstant P1024 */\
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\

View File

@@ -47,25 +47,19 @@
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) || defined(__SSE2__)
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
}
#else
#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
#endif
#if defined(VL256)

View File

@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );

View File

@@ -319,7 +319,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
for (u = 0; u < 5; u ++)
casti_m128i( dst, u ) = sc->val[u];
casti_v128u32( dst, u ) = sc->val[u];
}
#endif

View File

@@ -74,8 +74,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
memset( pad, 0x36, 64*4 );
for ( i = 0; i < Klen; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->ictx, pad, 64 );
@@ -83,8 +83,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
sha256_4way_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
}
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_m128i( T, k ) = _mm_xor_si128( casti_m128i( T, k ),
casti_m128i( U, k ) );
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}
/* Copy as many bytes as necessary into buf. */

View File

@@ -569,8 +569,8 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
__m128i STATE0, STATE1, MSG, TMP;
// Load initial values
TMP = casti_m128i( istate, 0 );
STATE1 = casti_m128i( istate, 1 );
TMP = casti_v128u32( istate, 0 );
STATE1 = casti_v128u32( istate, 1 );
TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB
STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH
@@ -578,17 +578,17 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH
// Save current hash
casti_m128i( sstate, 0 ) = STATE0;
casti_m128i( sstate, 1 ) = STATE1;
casti_v128u32( sstate, 0 ) = STATE0;
casti_v128u32( sstate, 1 ) = STATE1;
// Rounds 0 to 3
MSG = casti_m128i( msg, 0 );
MSG = casti_v128u32( msg, 0 );
TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
MSG = _mm_add_epi32( MSG, TMP );
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
MSG = _mm_shuffle_epi32( MSG, 0x0E );
casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_m128i( ostate, 1 ) = STATE1;
casti_v128u32( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_v128u32( ostate, 1 ) = STATE1;
}
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
@@ -601,22 +601,22 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
STATE0_X = casti_m128i( state_mid_X, 0 );
STATE1_X = casti_m128i( state_mid_X, 1 );
STATE0_Y = casti_m128i( state_mid_Y, 0 );
STATE1_Y = casti_m128i( state_mid_Y, 1 );
STATE0_X = casti_v128u32( state_mid_X, 0 );
STATE1_X = casti_v128u32( state_mid_X, 1 );
STATE0_Y = casti_v128u32( state_mid_Y, 0 );
STATE1_Y = casti_v128u32( state_mid_Y, 1 );
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMSG0_X = casti_v128u32( msg_X, 0 );
TMSG0_Y = casti_v128u32( msg_Y, 0 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
// Rounds 4 to 7
TMSG1_X = casti_m128i( msg_X, 1 );
TMSG1_Y = casti_m128i( msg_Y, 1 );
TMSG1_X = casti_v128u32( msg_X, 1 );
TMSG1_Y = casti_v128u32( msg_Y, 1 );
TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
@@ -638,8 +638,8 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X );
// Rounds 12 to 15
TMSG3_X = casti_m128i( msg_X, 3 );
TMSG3_Y = casti_m128i( msg_Y, 3 );
TMSG3_X = casti_v128u32( msg_X, 3 );
TMSG3_Y = casti_v128u32( msg_Y, 3 );
TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
@@ -867,20 +867,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
// Add saved state to new state
STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) );
STATE0_X = _mm_add_epi32( STATE0_X, casti_v128u32( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_v128u32( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_v128u32( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_v128u32( state_save_Y, 1 ) );
// Unshuffle & save state
TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA
TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B );
STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG
STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );
casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
casti_v128u32( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_v128u32( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_v128u32( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_v128u32( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
}
#endif // SHA

View File

@@ -300,11 +300,12 @@ static inline __m512i v512_mult_x5( const __m512i x )
#define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
v512_mult_x3( mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
xa0 = mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ); \
xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_16 do { \
@@ -905,11 +906,12 @@ static inline __m256i v256_mult_x5( const __m256i x )
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
v256_mult_x3( mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
xa0 = mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ); \
xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_8 do { \

View File

@@ -62,8 +62,6 @@ union u32 {
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif
//TODO aarch support for widening multiply
#if defined(__SSE2__)
#define vec_and(x,y) ((x)&(y))

View File

@@ -204,11 +204,11 @@ int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
const __m512i eight = _mm512_set1_epi64( 8 );
const bool bench = opt_benchmark;
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
@@ -372,11 +372,11 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
const __m256i four = _mm256_set1_epi64x( 4 );
const bool bench = opt_benchmark;
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );

View File

@@ -418,11 +418,11 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata );
@@ -681,11 +681,11 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
@@ -895,11 +895,11 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = v128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_m128i( pdata, 4 ) );
edata[0] = v128_swap64_32( casti_v128u32( pdata, 0 ) );
edata[1] = v128_swap64_32( casti_v128u32( pdata, 1 ) );
edata[2] = v128_swap64_32( casti_v128u32( pdata, 2 ) );
edata[3] = v128_swap64_32( casti_v128u32( pdata, 3 ) );
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );