Compare commits

...

3 Commits

Author SHA1 Message Date
Jay D Dee
e625ed5420 v3.9.5.3 2019-07-12 10:42:38 -04:00
Jay D Dee
9abc19a30a v3.9.5.2 2019-07-04 12:12:11 -04:00
Jay D Dee
0d769ee0fe v3.9.5.1 2019-07-02 15:10:38 -04:00
75 changed files with 2849 additions and 2118 deletions

View File

@@ -38,9 +38,40 @@ supported.
Change Log
----------
v3.9.5.3
Fixed a crash when mining hodl with aes-sse42.
More restructuring and share report tweaks.
v3.9.5.2
Reverted the bswap-interleave optimization because it caused crashes on Windows.
v3.9.5.1
Fixed skein2 crash on Windows.
Fixed CPU temperature reading on Ubuntu 19.04.
Realigned log message colours: blue is used to report normal activity and
yellow is used only to report abnormal activity.
Changed stats colours: yellow now means below average, white means the average
range. Tweaked colour thresholds.
Changed the colour of stratum difficulty change messages to blue to match other
normal protocol messages. Blue messages (block, stratum, submit) are no
longer displayed when using the -q option.
Added the job id to new block, share submit, and share result messages, and
added a new message when a new job is received for an existing block. This
allows better troubleshooting of the invalid job id rejects seen at zergpool.
Some more restructuring.
v3.9.5
New share reporting information includes calculation of equivalent hhashrate
New share reporting information includes calculation of equivalent hashrate
based on share difficulty, network latency, and a 5 minute summary.
Per-thread hash rate reports are disabled by default.
New command line option --hash-meter added to enable per-thread hash rates.
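
The equivalent hashrate mentioned in the v3.9.5 notes is derived from accepted share difficulty rather than raw hash counts. A minimal sketch of that relationship, assuming Bitcoin-style difficulty where a difficulty-1 share represents roughly 2^32 hashes on average; the names below are illustrative, not the miner's actual code:

#include <stdint.h>
#include <stdio.h>

// Hedged sketch: a share of difficulty D represents on average D * 2^32
// hashes, so summing accepted share difficulty over an interval gives an
// equivalent hashrate independent of per-thread hash counters.
static double equivalent_hashrate( double sum_share_diff, double elapsed_s )
{
   if ( elapsed_s <= 0.0 ) return 0.0;
   return sum_share_diff * 4294967296.0 / elapsed_s;   // 2^32
}

int main(void)
{
   // e.g. shares totalling 3.5 difficulty over a 5 minute (300 s) window
   printf( "%.0f H/s\n", equivalent_hashrate( 3.5, 300.0 ) );
   return 0;
}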

View File

@@ -368,9 +368,9 @@ bool submit_solution( struct work *work, void *hash,
if ( submit_work( thr, work ) )
{
if ( !opt_quiet )
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.",
accepted_share_count + rejected_share_count + 1,
thr->id );
thr->id, work->job_id );
return true;
}
else
@@ -385,9 +385,12 @@ bool submit_lane_solution( struct work *work, void *hash,
if ( submit_work( thr, work ) )
{
if ( !opt_quiet )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
// applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d.",
// accepted_share_count + rejected_share_count + 1,
// thr->id, lane );
applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d, job %s.",
accepted_share_count + rejected_share_count + 1, thr->id,
lane );
lane, work->job_id );
return true;
}
else

View File

@@ -15,7 +15,7 @@ void blakehash_4way(void *state, const void *input)
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blake_4way( struct work *work, uint32_t max_nonce,

View File

@@ -412,34 +412,16 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm128_bswap_32( *(buf + 0) ); \
M[0x1] = mm128_bswap_32( *(buf + 1) ); \
M[0x2] = mm128_bswap_32( *(buf + 2) ); \
M[0x3] = mm128_bswap_32( *(buf + 3) ); \
M[0x4] = mm128_bswap_32( *(buf + 4) ); \
M[0x5] = mm128_bswap_32( *(buf + 5) ); \
M[0x6] = mm128_bswap_32( *(buf + 6) ); \
M[0x7] = mm128_bswap_32( *(buf + 7) ); \
M[0x8] = mm128_bswap_32( *(buf + 8) ); \
M[0x9] = mm128_bswap_32( *(buf + 9) ); \
M[0xA] = mm128_bswap_32( *(buf + 10) ); \
M[0xB] = mm128_bswap_32( *(buf + 11) ); \
M[0xC] = mm128_bswap_32( *(buf + 12) ); \
M[0xD] = mm128_bswap_32( *(buf + 13) ); \
M[0xE] = mm128_bswap_32( *(buf + 14) ); \
M[0xF] = mm128_bswap_32( *(buf + 15) ); \
V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
mm128_block_bswap_32( M, buf ); \
mm128_block_bswap_32( M+8, buf+8 ); \
for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -464,6 +446,54 @@ do { \
// current impl
#if defined(__SSSE3__)
#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
{ \
__m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
} while(0)
#else // SSE2
#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
{ \
M0 = mm128_bswap_32( buf[0] ); \
M1 = mm128_bswap_32( buf[1] ); \
M2 = mm128_bswap_32( buf[2] ); \
M3 = mm128_bswap_32( buf[3] ); \
M4 = mm128_bswap_32( buf[4] ); \
M5 = mm128_bswap_32( buf[5] ); \
M6 = mm128_bswap_32( buf[6] ); \
M7 = mm128_bswap_32( buf[7] ); \
M8 = mm128_bswap_32( buf[8] ); \
M9 = mm128_bswap_32( buf[9] ); \
MA = mm128_bswap_32( buf[10] ); \
MB = mm128_bswap_32( buf[11] ); \
MC = mm128_bswap_32( buf[12] ); \
MD = mm128_bswap_32( buf[13] ); \
ME = mm128_bswap_32( buf[14] ); \
MF = mm128_bswap_32( buf[15] ); \
} while(0)
#endif // SSSE3 else SSE2
#define COMPRESS32_4WAY( rounds ) \
do { \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -486,22 +516,7 @@ do { \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm128_bswap_32( buf[ 0] ); \
M1 = mm128_bswap_32( buf[ 1] ); \
M2 = mm128_bswap_32( buf[ 2] ); \
M3 = mm128_bswap_32( buf[ 3] ); \
M4 = mm128_bswap_32( buf[ 4] ); \
M5 = mm128_bswap_32( buf[ 5] ); \
M6 = mm128_bswap_32( buf[ 6] ); \
M7 = mm128_bswap_32( buf[ 7] ); \
M8 = mm128_bswap_32( buf[ 8] ); \
M9 = mm128_bswap_32( buf[ 9] ); \
MA = mm128_bswap_32( buf[10] ); \
MB = mm128_bswap_32( buf[11] ); \
MC = mm128_bswap_32( buf[12] ); \
MD = mm128_bswap_32( buf[13] ); \
ME = mm128_bswap_32( buf[14] ); \
MF = mm128_bswap_32( buf[15] ); \
BLAKE256_4WAY_BLOCK_BSWAP32; \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -519,14 +534,14 @@ do { \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
H0 = mm128_xor4( V8, V0, S0, H0 ); \
H1 = mm128_xor4( V9, V1, S1, H1 ); \
H2 = mm128_xor4( VA, V2, S2, H2 ); \
H3 = mm128_xor4( VB, V3, S3, H3 ); \
H4 = mm128_xor4( VC, V4, S0, H4 ); \
H5 = mm128_xor4( VD, V5, S1, H5 ); \
H6 = mm128_xor4( VE, V6, S2, H6 ); \
H7 = mm128_xor4( VF, V7, S3, H7 ); \
} while (0)
#endif
@@ -607,6 +622,7 @@ do { \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
__m256i shuf_bswap32; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -623,22 +639,24 @@ do { \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
M0 = mm256_bswap_32( * buf ); \
M1 = mm256_bswap_32( *(buf+1) ); \
M2 = mm256_bswap_32( *(buf+2) ); \
M3 = mm256_bswap_32( *(buf+3) ); \
M4 = mm256_bswap_32( *(buf+4) ); \
M5 = mm256_bswap_32( *(buf+5) ); \
M6 = mm256_bswap_32( *(buf+6) ); \
M7 = mm256_bswap_32( *(buf+7) ); \
M8 = mm256_bswap_32( *(buf+8) ); \
M9 = mm256_bswap_32( *(buf+9) ); \
MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_bswap_32( *(buf+15) ); \
shuf_bswap32 = _mm256_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
@@ -656,22 +674,14 @@ do { \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
} \
H0 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), \
S0 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), \
S1 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), \
S2 ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), \
S3 ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), \
S0 ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), \
S1 ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), \
S2 ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), \
S3 ), H7 ); \
H0 = mm256_xor4( V8, V0, S0, H0 ); \
H1 = mm256_xor4( V9, V1, S1, H1 ); \
H2 = mm256_xor4( VA, V2, S2, H2 ); \
H3 = mm256_xor4( VB, V3, S3, H3 ); \
H4 = mm256_xor4( VC, V4, S0, H4 ); \
H5 = mm256_xor4( VD, V5, S1, H5 ); \
H6 = mm256_xor4( VE, V6, S2, H6 ); \
H7 = mm256_xor4( VF, V7, S3, H7 ); \
} while (0)
@@ -685,6 +695,7 @@ static void
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
__m128i zero = m128_zero;
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
@@ -694,16 +705,10 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
casti_m128i( ctx->S, 0 ) = m128_zero;
casti_m128i( ctx->S, 1 ) = m128_zero;
casti_m128i( ctx->S, 2 ) = m128_zero;
casti_m128i( ctx->S, 3 ) = m128_zero;
/*
sc->S[0] = _mm_set1_epi32( salt[0] );
sc->S[1] = _mm_set1_epi32( salt[1] );
sc->S[2] = _mm_set1_epi32( salt[2] );
sc->S[3] = _mm_set1_epi32( salt[3] );
*/
casti_m128i( ctx->S, 0 ) = zero;
casti_m128i( ctx->S, 1 ) = zero;
casti_m128i( ctx->S, 2 ) = zero;
casti_m128i( ctx->S, 3 ) = zero;
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
@@ -796,14 +801,7 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
blake32_4way( ctx, buf, 64 );
}
casti_m128i( dst, 0 ) = mm128_bswap_32( casti_m128i( ctx->H, 0 ) );
casti_m128i( dst, 1 ) = mm128_bswap_32( casti_m128i( ctx->H, 1 ) );
casti_m128i( dst, 2 ) = mm128_bswap_32( casti_m128i( ctx->H, 2 ) );
casti_m128i( dst, 3 ) = mm128_bswap_32( casti_m128i( ctx->H, 3 ) );
casti_m128i( dst, 4 ) = mm128_bswap_32( casti_m128i( ctx->H, 4 ) );
casti_m128i( dst, 5 ) = mm128_bswap_32( casti_m128i( ctx->H, 5 ) );
casti_m128i( dst, 6 ) = mm128_bswap_32( casti_m128i( ctx->H, 6 ) );
casti_m128i( dst, 7 ) = mm128_bswap_32( casti_m128i( ctx->H, 7 ) );
mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H );
}
#if defined (__AVX2__)
@@ -816,11 +814,21 @@ static void
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set1_epi32( salt[i] );
__m256i zero = m256_zero;
casti_m256i( sc->H, 0 ) = _mm256_set1_epi32( iv[0] );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi32( iv[1] );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi32( iv[2] );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi32( iv[3] );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi32( iv[4] );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi32( iv[5] );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi32( iv[6] );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi32( iv[7] );
casti_m256i( sc->S, 0 ) = zero;
casti_m256i( sc->S, 1 ) = zero;
casti_m256i( sc->S, 2 ) = zero;
casti_m256i( sc->S, 3 ) = zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
@@ -872,14 +880,10 @@ static void
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
// union {
__m256i buf[16];
// sph_u32 dummy;
// } u;
size_t ptr, k;
__m256i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
__m256i *out;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
@@ -923,9 +927,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf, 64 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm256_bswap_32( sc->H[k] );
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
}
#endif
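
The recurring change in this file replaces per-word mm128_bswap_32 calls and _mm_set_epi32(x,x,x,x) broadcasts with _mm_set1_epi32 and a single SSSE3 shuffle (or the mm128_block_bswap_32 helper) that byte-swaps a whole block at once. A minimal sketch of the shuffle-based swap, assuming SSSE3; the helper names are illustrative, not the project's:

#include <immintrin.h>

// Hedged sketch: one PSHUFB with a constant mask reverses the byte order of
// all four 32-bit lanes at once, matching the shuf_bswap32 constant above.
static inline __m128i bswap32_ssse3( __m128i x )
{
   const __m128i mask = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                        0x0405060700010203 );
   return _mm_shuffle_epi8( x, mask );
}

// Block form, mirroring the mm128_block_bswap_32( M, buf ) usage in the diff:
// swap eight vectors (half of the 16-word message block) per call.
static inline void block_bswap32_sketch( __m128i *d, const __m128i *s )
{
   for ( int i = 0; i < 8; i++ ) d[i] = bswap32_ssse3( s[i] );
}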

View File

@@ -83,7 +83,7 @@ void blake2s_4way_hash( void *output, const void *input )
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
mm128_dintrlv_4x32( output, output+32, output+64, output+96,
dintrlv_4x32( output, output+32, output+64, output+96,
vhash, 256 );
}

View File

@@ -412,18 +412,18 @@ static const sph_u64 CB[16] = {
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \
@@ -464,80 +464,76 @@ static const sph_u64 CB[16] = {
//current impl
#define COMPRESS64_4WAY do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
ROUND_B_4WAY(6); \
ROUND_B_4WAY(7); \
ROUND_B_4WAY(8); \
ROUND_B_4WAY(9); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#define COMPRESS64_4WAY do \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
__m256i shuf_bswap64; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set1_epi64x( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set1_epi64x( CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set1_epi64x( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set1_epi64x( CB7 ) ); \
shuf_bswap64 = _mm256_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
ROUND_B_4WAY(6); \
ROUND_B_4WAY(7); \
ROUND_B_4WAY(8); \
ROUND_B_4WAY(9); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
H0 = mm256_xor4( V8, V0, S0, H0 ); \
H1 = mm256_xor4( V9, V1, S1, H1 ); \
H2 = mm256_xor4( VA, V2, S2, H2 ); \
H3 = mm256_xor4( VB, V3, S3, H3 ); \
H4 = mm256_xor4( VC, V4, S0, H4 ); \
H5 = mm256_xor4( VD, V5, S1, H5 ); \
H6 = mm256_xor4( VE, V6, S2, H6 ); \
H7 = mm256_xor4( VF, V7, S3, H7 ); \
} while (0)
#endif
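
The mm256_xor4 calls above collapse the previous three-deep _mm256_xor_si256 nesting in the H0..H7 finalisation into one call each. It is presumably a trivial wrapper along these lines (an illustrative definition, not the project's header):

#include <immintrin.h>

// Hedged sketch: XOR four vectors; the compiler emits the same three VPXORs,
// the gain is readability. Usage mirrors the diff: H0 = xor4_sketch( V8, V0, S0, H0 );
static inline __m256i xor4_sketch( __m256i a, __m256i b, __m256i c, __m256i d )
{
   return _mm256_xor_si256( _mm256_xor_si256( a, b ),
                            _mm256_xor_si256( c, d ) );
}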
@@ -547,13 +543,23 @@ static void
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
const sph_u64 *salt )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set1_epi64x( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set1_epi64x( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
__m256i zero = m256_zero;
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( iv[0] );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( iv[1] );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( iv[2] );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( iv[3] );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( iv[4] );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( iv[5] );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( iv[6] );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( iv[7] );
casti_m256i( sc->S, 0 ) = zero;
casti_m256i( sc->S, 1 ) = zero;
casti_m256i( sc->S, 2 ) = zero;
casti_m256i( sc->S, 3 ) = zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
static void
@@ -604,15 +610,11 @@ static void
blake64_4way_close( blake_4way_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
{
// union {
__m256i buf[16];
// sph_u64 dummy;
// } u;
size_t ptr, k;
__m256i buf[16];
size_t ptr;
unsigned bit_len;
uint64_t z, zz;
sph_u64 th, tl;
__m256i *out;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
@@ -665,9 +667,7 @@ blake64_4way_close( blake_4way_big_context *sc,
blake64_4way( sc, buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_bswap_64( sc->H[k] );
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
void

View File

@@ -17,7 +17,7 @@ void blakecoin_4way_hash(void *state, const void *input)
blake256r8_4way( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,

View File

@@ -23,7 +23,7 @@ void decred_hash_4way( void *state, const void *input )
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,

View File

@@ -113,50 +113,27 @@ static const uint32_t IV256[] = {
#define expand1s( qt, M, H, i ) \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)-16 ] ), \
ss2( qt[ (i)-15 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)-14 ] ), \
ss0( qt[ (i)-13 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)-12 ] ), \
ss2( qt[ (i)-11 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)-10 ] ), \
ss0( qt[ (i)- 9 ] ) ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)- 8 ] ), \
ss2( qt[ (i)- 7 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)- 6 ] ), \
ss0( qt[ (i)- 5 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)- 4 ] ), \
ss2( qt[ (i)- 3 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)- 2 ] ), \
ss0( qt[ (i)- 1 ] ) ) ) ) ), \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
#define expand2s( qt, M, H, i) \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \
_mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \
_mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ) ), \
_mm_add_epi32( qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ) ), \
_mm_add_epi32( ss4( qt[ (i)- 2 ] ), \
ss5( qt[ (i)- 1 ] ) ) ) ) ), \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
#define Ws0 \
@@ -357,17 +334,11 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
qt[30] = expand2s( qt, M, H, 30 );
qt[31] = expand2s( qt, M, H, 31 );
xl = _mm_xor_si128(
_mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
_mm_xor_si128( qt[18], qt[19] ) ),
_mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
_mm_xor_si128( qt[22], qt[23] ) ) );
xh = _mm_xor_si128( xl,
_mm_xor_si128(
_mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
_mm_xor_si128( qt[26], qt[27] ) ),
_mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
_mm_xor_si128( qt[30], qt[31] ) )));
xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm_xor_si128( xl, _mm_xor_si128(
mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
dH[ 0] = _mm_add_epi32(
_mm_xor_si128( M[0],
@@ -695,22 +666,15 @@ bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#define expand2s8( qt, M, H, i) \
_mm256_add_epi32( \
_mm256_add_epi32( \
_mm256_add_epi32( \
_mm256_add_epi32( \
_mm256_add_epi32( qt[ (i)-16 ], r8s1( qt[ (i)-15 ] ) ), \
_mm256_add_epi32( qt[ (i)-14 ], r8s2( qt[ (i)-13 ] ) ) ), \
_mm256_add_epi32( \
_mm256_add_epi32( qt[ (i)-12 ], r8s3( qt[ (i)-11 ] ) ), \
_mm256_add_epi32( qt[ (i)-10 ], r8s4( qt[ (i)- 9 ] ) ) ) ), \
_mm256_add_epi32( \
_mm256_add_epi32( \
_mm256_add_epi32( qt[ (i)- 8 ], r8s5( qt[ (i)- 7 ] ) ), \
_mm256_add_epi32( qt[ (i)- 6 ], r8s6( qt[ (i)- 5 ] ) ) ), \
_mm256_add_epi32( \
_mm256_add_epi32( qt[ (i)- 4 ], r8s7( qt[ (i)- 3 ] ) ), \
_mm256_add_epi32( s8s4( qt[ (i)- 2 ] ), \
s8s5( qt[ (i)- 1 ] ) ) ) ) ), \
mm256_add4_32( \
mm256_add4_32( qt[ (i)-16 ], r8s1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], r8s2( qt[ (i)-13 ] ) ), \
mm256_add4_32( qt[ (i)-12 ], r8s3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], r8s4( qt[ (i)- 9 ] ) ), \
mm256_add4_32( qt[ (i)- 8 ], r8s5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], r8s6( qt[ (i)- 5 ] ) ), \
mm256_add4_32( qt[ (i)- 4 ], r8s7( qt[ (i)- 3 ] ), \
s8s4( qt[ (i)- 2 ] ), s8s5( qt[ (i)- 1 ] ) ) ), \
add_elt_s8( M, H, (i)-16 ) )
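
Similarly, the BMW expand macros now feed mm128_add4_32 and mm256_add4_32 instead of long chains of pairwise adds. A sketch of the 4-operand add, assuming it is the obvious wrapper (names illustrative, not the project's definitions):

#include <immintrin.h>

// Hedged sketch of a 4-operand 32-bit lane add like mm128_add4_32 above.
static inline __m128i add4_32_sketch( __m128i a, __m128i b, __m128i c, __m128i d )
{
   return _mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) );
}

// 256-bit variant used by the 8-way path (mm256_add4_32 in the diff).
static inline __m256i add4_32x8_sketch( __m256i a, __m256i b, __m256i c, __m256i d )
{
   return _mm256_add_epi32( _mm256_add_epi32( a, b ),
                            _mm256_add_epi32( c, d ) );
}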
@@ -913,16 +877,11 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
qt[31] = expand2s8( qt, M, H, 31 );
xl = _mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
_mm256_xor_si256( qt[18], qt[19] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
_mm256_xor_si256( qt[22], qt[23] ) ) );
xh = _mm256_xor_si256( xl,
_mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
_mm256_xor_si256( qt[26], qt[27] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
_mm256_xor_si256( qt[30], qt[31] ) )));
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm256_xor_si256( xl, _mm256_xor_si256(
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
dH[ 0] = _mm256_add_epi32(
_mm256_xor_si256( M[0],

View File

@@ -569,28 +569,20 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
#define sb0(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 3) ), \
_mm256_xor_si256( mm256_rol_64( (x), 4), \
mm256_rol_64( (x), 37) ) )
mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 3), \
mm256_rol_64( (x), 4), mm256_rol_64( (x),37) )
#define sb1(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 2) ), \
_mm256_xor_si256( mm256_rol_64( (x), 13), \
mm256_rol_64( (x), 43) ) )
mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 2), \
mm256_rol_64( (x),13), mm256_rol_64( (x),43) )
#define sb2(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
_mm256_slli_epi64( (x), 1) ), \
_mm256_xor_si256( mm256_rol_64( (x), 19), \
mm256_rol_64( (x), 53) ) )
mm256_xor4( _mm256_srli_epi64( (x), 2), _mm256_slli_epi64( (x), 1), \
mm256_rol_64( (x),19), mm256_rol_64( (x),53) )
#define sb3(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
_mm256_slli_epi64( (x), 2) ), \
_mm256_xor_si256( mm256_rol_64( (x), 28), \
mm256_rol_64( (x), 59) ) )
mm256_xor4( _mm256_srli_epi64( (x), 2), _mm256_slli_epi64( (x), 2), \
mm256_rol_64( (x),28), mm256_rol_64( (x),59) )
#define sb4(x) \
_mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) )
@@ -618,55 +610,32 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
rol_off_64( M, j, 10 ) ), \
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1b( qt, M, H, i ) \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)-16 ] ), \
sb2( qt[ (i)-15 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)-14 ] ), \
sb0( qt[ (i)-13 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)-12 ] ), \
sb2( qt[ (i)-11 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)-10 ] ), \
sb0( qt[ (i)- 9 ] ) ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \
sb2( qt[ (i)- 7 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)- 6 ] ), \
sb0( qt[ (i)- 5 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \
sb2( qt[ (i)- 3 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \
sb0( qt[ (i)- 1 ] ) ) ) ) ), \
_mm256_add_epi64( mm256_add4_64( \
mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
sb3( qt[ (i)-10 ] ), sb0( qt[ (i)- 9 ] )), \
mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \
add_elt_b( M, H, (i)-16 ) )
#define expand2b( qt, M, H, i) \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \
_mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \
_mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \
_mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \
_mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \
sb5( qt[ (i)- 1 ] ) ) ) ) ), \
_mm256_add_epi64( mm256_add4_64( \
mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ), \
mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
add_elt_b( M, H, (i)-16 ) )
#define Wb0 \
_mm256_add_epi64( \
_mm256_add_epi64( \
@@ -864,95 +833,90 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
qt[30] = expand2b( qt, M, H, 30 );
qt[31] = expand2b( qt, M, H, 31 );
xl = _mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
_mm256_xor_si256( qt[18], qt[19] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
_mm256_xor_si256( qt[22], qt[23] ) ) );
xh = _mm256_xor_si256( xl,
_mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
_mm256_xor_si256( qt[26], qt[27] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
_mm256_xor_si256( qt[30], qt[31] ) )));
xl = _mm256_xor_si256(
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm256_xor_si256( xl, _mm256_xor_si256(
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
dH[ 0] = _mm256_add_epi64(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
_mm256_srli_epi64( qt[16], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
_mm256_srli_epi64( qt[16], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ) );
dH[ 1] = _mm256_add_epi64(
_mm256_xor_si256( M[1],
_mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
_mm256_slli_epi64( qt[17], 8 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
_mm256_xor_si256( M[1],
_mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
_mm256_slli_epi64( qt[17], 8 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ) );
dH[ 2] = _mm256_add_epi64(
_mm256_xor_si256( M[2],
_mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
_mm256_slli_epi64( qt[18], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
_mm256_xor_si256( M[2],
_mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
_mm256_slli_epi64( qt[18], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ) );
dH[ 3] = _mm256_add_epi64(
_mm256_xor_si256( M[3],
_mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
_mm256_slli_epi64( qt[19], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
_mm256_xor_si256( M[3],
_mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
_mm256_slli_epi64( qt[19], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ) );
dH[ 4] = _mm256_add_epi64(
_mm256_xor_si256( M[4],
_mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
_mm256_slli_epi64( qt[20], 0 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
_mm256_xor_si256( M[4],
_mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
_mm256_slli_epi64( qt[20], 0 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ) );
dH[ 5] = _mm256_add_epi64(
_mm256_xor_si256( M[5],
_mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
_mm256_srli_epi64( qt[21], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
_mm256_xor_si256( M[5],
_mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
_mm256_srli_epi64( qt[21], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ) );
dH[ 6] = _mm256_add_epi64(
_mm256_xor_si256( M[6],
_mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
_mm256_slli_epi64( qt[22], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
_mm256_xor_si256( M[6],
_mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
_mm256_slli_epi64( qt[22], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ) );
dH[ 7] = _mm256_add_epi64(
_mm256_xor_si256( M[7],
_mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
_mm256_slli_epi64( qt[23], 2 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
_mm256_xor_si256( M[7],
_mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
_mm256_slli_epi64( qt[23], 2 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ) );
dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[4], 9 ),
mm256_rol_64( dH[4], 9 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[5], 10 ),
mm256_rol_64( dH[5], 10 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
dH[10] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[6], 11 ),
mm256_rol_64( dH[6], 11 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
_mm256_xor_si256( qt[17], qt[10] ) ) );
dH[11] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[7], 12 ),
mm256_rol_64( dH[7], 12 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
_mm256_xor_si256( qt[18], qt[11] ) ) );
dH[12] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[0], 13 ),
mm256_rol_64( dH[0], 13 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
_mm256_xor_si256( qt[19], qt[12] ) ) );
dH[13] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[1], 14 ),
mm256_rol_64( dH[1], 14 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
_mm256_xor_si256( qt[20], qt[13] ) ) );
dH[14] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[2], 15 ),
mm256_rol_64( dH[2], 15 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
_mm256_xor_si256( qt[21], qt[14] ) ) );
dH[15] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rol_64( dH[3], 16 ),
mm256_rol_64( dH[3], 16 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
_mm256_xor_si256( qt[22], qt[15] ) ) );

View File

@@ -33,7 +33,7 @@ void myriad_4way_hash( void *output, const void *input )
myrgr_4way_ctx_holder ctx;
memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );
dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
@@ -43,7 +43,7 @@ void myriad_4way_hash( void *output, const void *input )
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, output );
@@ -89,7 +89,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -531,16 +531,17 @@ static const sph_u32 T512[64][16] = {
#define INPUT_BIG \
do { \
const __m256i zero = _mm256_setzero_si256(); \
__m256i db = *buf; \
const sph_u32 *tp = &T512[0][0]; \
m0 = m256_zero; \
m1 = m256_zero; \
m2 = m256_zero; \
m3 = m256_zero; \
m4 = m256_zero; \
m5 = m256_zero; \
m6 = m256_zero; \
m7 = m256_zero; \
m0 = zero; \
m1 = zero; \
m2 = zero; \
m3 = zero; \
m4 = zero; \
m5 = zero; \
m6 = zero; \
m7 = zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
@@ -913,9 +914,7 @@ void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{
__m256i *out = (__m256i*)dst;
__m256i pad[1];
size_t u;
int ch, cl;
sph_enc32be( &ch, sc->count_high );
@@ -925,8 +924,8 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
0UL, 0x80UL, 0UL, 0x80UL );
hamsi_big( sc, sc->buf, 1 );
hamsi_big_final( sc, pad );
for ( u = 0; u < 8; u ++ )
out[u] = mm256_bswap_32( sc->h[u] );
mm256_block_bswap_32( (__m256i*)dst, sc->h );
}
#ifdef __cplusplus

View File

@@ -83,7 +83,7 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
keys[14] = tmp1;
}
#ifdef __SSE4_2__
#if defined(__SSE4_2__)
//#ifdef __AVX__
#define AESENC(i,j) \
@@ -151,7 +151,7 @@ void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16],
}
}
#else // NO SSE4.2
#else // NO AVX
static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
{

View File

@@ -166,7 +166,7 @@ bool register_hodl_algo( algo_gate_t* gate )
// return false;
// }
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;

View File

@@ -17,7 +17,7 @@ void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
const uint32_t StartChunk = ThreadID * Chunk;
const uint32_t EndChunk = StartChunk + Chunk;
#ifdef __SSE4_2__
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
uint64_t* desination[ SHA512_PARALLEL_N ];
@@ -64,7 +64,7 @@ void Rev256(uint32_t *Dest, const uint32_t *Src)
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
#ifdef __SSE4_2__
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -140,7 +140,7 @@ int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
return(0);
#else // no SSE4.2
#else // no AVX
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -148,6 +148,7 @@ int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
CacheEntry Cache;
uint32_t CollisionCount = 0;
int threadNumber = mythr->id;
swab32_array( BlockHdr, pdata, 20 );
// Search for pattern in psuedorandom data
@@ -205,7 +206,7 @@ int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
*hashes_done = CollisionCount;
return(0);
#endif // SSE4.2 else
#endif // AVX else
}

View File

@@ -23,6 +23,7 @@ typedef struct
__m256i h[8];
__m256i w[80];
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
__m128i h[8];
__m128i w[80];
#else
@@ -32,7 +33,8 @@ typedef struct
#ifdef __AVX2__
#define SHA512_PARALLEL_N 8
#elif defined(__SSE$_2__)
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
#define SHA512_PARALLEL_N 4
#else
#define SHA512_PARALLEL_N 1 // dummy value
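
The hodl files switch from #ifdef X to #if defined(X). The two forms are equivalent for a single macro, but defined() composes into larger conditions, which fits the commented-out //#ifdef __AVX__ alternative kept beside each gate. A small illustrative example (PARALLEL_N here is a made-up macro, not the project's):

#include <stdio.h>

// Hedged sketch: defined() lets one gate test several feature macros at once.
#if defined(__AVX2__)
   #define PARALLEL_N 8
#elif defined(__AVX__) || defined(__SSE4_2__)
   #define PARALLEL_N 4
#else
   #define PARALLEL_N 1   // scalar fallback
#endif

int main(void)
{
   printf( "compiled for %d-way path\n", PARALLEL_N );
   return 0;
}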

View File

@@ -1,6 +1,6 @@
#ifndef __AVX2__
#ifdef __SSE4_2__
#if defined(__SSE4_2__)
//#ifdef __AVX__
//Dependencies

View File

@@ -6,7 +6,7 @@
void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf);
#ifdef __SSE4_2__
#if defined(__SSE4_2__)
//#ifdef __AVX__
#define AES_PARALLEL_N 8

View File

@@ -88,8 +88,9 @@ int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -115,12 +116,11 @@ int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
0
};
// for ( int i=0; i < 19; i++ )
// be32enc( &endiandata[i], pdata[i] );
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
// uint64_t *edata = (uint64_t*)endiandata;
// mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m = 0; m < 6; m++ )
{
@@ -143,7 +143,7 @@ int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
// && fulltest( hash+(i<<3), ptarget ) )
for ( int i = 0; i < 4; i++ ) if ( !( (hash7[i] & mask ) == 0 ) )
{
mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
mm256_extr_lane_4x64( lane_hash, hash, i, 256 );
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
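
Several scanhash functions in this compare add __attribute__((aligned(32))) to lane_hash. That matters if the lane is copied out with aligned vector stores, since an unaligned stack buffer would fault. A minimal sketch of the constraint, assuming an aligned 256-bit store somewhere in the extract path (helper name illustrative):

#include <stdint.h>
#include <immintrin.h>

// Hedged sketch: an aligned 256-bit store into lane_hash requires the buffer
// to be 32-byte aligned, hence the attribute added in the diff.
static void store_lane( uint32_t *dst, __m256i v )
{
   _mm256_store_si256( (__m256i*)dst, v );   // faults if dst is misaligned
}

int main(void)
{
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   store_lane( lane_hash, _mm256_set1_epi32( 0x600DF00D ) );
   return (int)( lane_hash[0] & 1 );
}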

View File

@@ -21,8 +21,9 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
{
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -31,7 +32,9 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
// const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
@@ -41,7 +44,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
{
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
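
This file (and the lyra2 ones below) replaces the combined mm256_bswap_intrlv80_* call with swab32_array followed by a plain interleave, per the v3.9.5.2 revert noted in the changelog. A sketch of what the byte-swap step does, assuming swab32_array simply swaps each 32-bit word of the 80-byte block header (illustrative helper, not the project's exact definition):

#include <stdint.h>

// Hedged sketch: reverse the byte order of each of the first len 32-bit words.
static inline uint32_t swab32_sketch( uint32_t x )
{
   return ( x >> 24 ) | ( ( x >> 8 ) & 0x0000ff00 )
        | ( ( x << 8 ) & 0x00ff0000 ) | ( x << 24 );
}

static inline void swab32_array_sketch( uint32_t *dst, const uint32_t *src,
                                        int len )
{
   for ( int i = 0; i < len; i++ ) dst[i] = swab32_sketch( src[i] );
}

// Usage mirroring the diff: swab32_array_sketch( edata, pdata, 20 );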

View File

@@ -77,6 +77,24 @@ static const sph_u32 V_INIT[5][8] = {
}
};
#if SPH_LUFFA_PARALLEL
static const sph_u64 RCW010[8] = {
SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
};
static const sph_u64 RCW014[8] = {
SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
};
#else
static const sph_u32 RC00[8] = {
SPH_C32(0x303994a6), SPH_C32(0xc0e65299),
SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e),
@@ -105,20 +123,18 @@ static const sph_u32 RC14[8] = {
SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704)
};
#if SPH_LUFFA_PARALLEL
static const sph_u64 RCW010[8] = {
SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
static const sph_u32 RC30[8] = {
SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
};
static const sph_u64 RCW014[8] = {
SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
static const sph_u32 RC34[8] = {
SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
};
#endif
@@ -137,19 +153,6 @@ static const sph_u32 RC24[8] = {
SPH_C32(0x36eda57f), SPH_C32(0x703aace7)
};
static const sph_u32 RC30[8] = {
SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
};
static const sph_u32 RC34[8] = {
SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
};
#if SPH_LUFFA_PARALLEL

View File

@@ -5,7 +5,7 @@
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
//#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
__thread uint64_t* lyra2h_4way_matrix;
@@ -36,7 +36,7 @@ void lyra2h_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
16, 16, 16 );

View File

@@ -78,7 +78,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
@@ -90,7 +90,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -116,7 +116,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -91,8 +91,9 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -104,7 +105,10 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap_intrlv80_8x32( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_8x32( vdata, pdata );
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
@@ -115,7 +119,7 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
{
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
@@ -161,7 +165,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -181,7 +185,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
@@ -192,7 +196,7 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -214,7 +218,7 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -36,7 +36,7 @@ void lyra2z_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
@@ -146,6 +146,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -157,7 +158,10 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm256_bswap_intrlv80_8x32( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {

View File

@@ -168,7 +168,7 @@ int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
uint32_t _ALIGN(128) edata[36];
uint32_t vdata[4][36] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -217,7 +217,7 @@ int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[ lane<<1 ] < Htarg )
{
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -207,6 +207,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
SHA512_Update( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );
@@ -222,18 +223,18 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );
mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
mpz_set(bns1, bns0);
mpz_set(product, bns0);
for ( i=1; i < 7; i++ )
mpz_set(product, bns0);
for ( i=1; i < 7; i++ )
{
mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
mpz_add(bns1, bns1, bns0);
mpz_mul(product, product, bns0);
mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
mpz_add(bns1, bns1, bns0);
mpz_mul(product, product, bns0);
}
mpz_mul(product, product, bns1);
mpz_mul(product, product, product);
mpz_mul(product, product, product);
bytes = mpz_sizeinbase(product, 256);
mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);
@@ -243,27 +244,27 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
mpf_set_prec_raw(magifpi, prec);
mpf_set_prec_raw(mptmp, prec);
mpf_set_prec_raw(mpt1, prec);
mpf_set_prec_raw(mpt2, prec);
mpf_set_prec_raw(magifpi, prec);
mpf_set_prec_raw(mptmp, prec);
mpf_set_prec_raw(mpt1, prec);
mpf_set_prec_raw(mpt2, prec);
usw_ = sw2_(n/2);
mpzscale = 1;
mpzscale = 1;
mpz_set_ui(magisw, usw_);
for ( i = 0; i < 5; i++ )
{
mpf_set_d(mpt1, 0.25*mpzscale);
mpf_sub(mpt1, mpt1, mpt2);
mpf_sub(mpt1, mpt1, mpt2);
mpf_abs(mpt1, mpt1);
mpf_div(magifpi, magifpi0, mpt1);
mpf_pow_ui(mptmp, mpten, digits >> 1);
mpf_mul(magifpi, magifpi, mptmp);
mpz_set_f(magipi, magifpi);
mpz_set_f(magipi, magifpi);
mpz_add(magipi,magipi,magisw);
mpz_add(product,product,magipi);
mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
mpz_add(bns1, bns1, bns0);
mpz_mul(product,product,bns1);
mpz_cdiv_q (product, product, bns0);
@@ -275,18 +276,18 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
SHA256_Init( &ctxf_sha256 );
SHA256_Update( &ctxf_sha256, bdata, bytes );
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
}
}
const unsigned char *hash_ = (const unsigned char *)hash;
const unsigned char *target_ = (const unsigned char *)ptarget;
for ( i = 31; i >= 0; i-- )
const unsigned char *hash_ = (const unsigned char *)hash;
const unsigned char *target_ = (const unsigned char *)ptarget;
for ( i = 31; i >= 0; i-- )
{
if ( hash_[i] != target_[i] )
{
rc = hash_[i] < target_[i];
break;
}
}
if ( hash_[i] != target_[i] )
{
rc = hash_[i] < target_[i];
break;
}
}
if ( unlikely(rc) )
{
if ( opt_debug )
@@ -299,15 +300,15 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
hash_str,
target_str);
}
work_set_target_ratio( work, hash );
pdata[19] = data[19];
goto out;
}
submit_solution( work, hash, mythr );
}
} while (n < max_nonce && !work_restart[thr_id].restart);
pdata[19] = n;
out:
// can this be skipped after finding a share? Seems to work ok.
//out:
mpf_set_prec_raw(magifpi, prec0);
mpf_set_prec_raw(magifpi0, prec0);
mpf_set_prec_raw(mptmp, prec0);

View File

@@ -70,7 +70,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
@@ -122,7 +122,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
{
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -50,6 +50,7 @@ void anime_4way_hash( void *state, const void *input )
__m256i vh_mask;
const uint32_t mask = 8;
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const __m256i zero = _mm256_setzero_si256();
anime_4way_ctx_holder ctx;
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
@@ -59,8 +60,7 @@ void anime_4way_hash( void *state, const void *input )
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
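/* A scalar sketch of the lane mask just built: the compare yields an
   all-ones 64-bit element for each lane whose first hash word has bit 3
   clear, and an all-zero element otherwise. */
#include <stdint.h>

static inline uint64_t lane_mask_model( uint64_t h0 )
{
   return ( h0 & 8 ) == 0 ? ~0ULL : 0ULL;   /* cmpeq( h0 & 8, 0 ) */
}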
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -114,8 +114,7 @@ void anime_4way_hash( void *state, const void *input )
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
if ( mm256_anybits1( vh_mask ) )
{
@@ -139,8 +138,7 @@ void anime_4way_hash( void *state, const void *input )
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
if ( mm256_anybits1( vh_mask ) )
{
@@ -165,6 +163,7 @@ int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -189,7 +188,9 @@ int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
0
};
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])

View File

@@ -575,7 +575,7 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
// uint32_t *hash7 = &(hash[25]);
// uint32_t lane_hash[8];
// uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;

View File

@@ -51,6 +51,7 @@ void quark_4way_hash( void *state, const void *input )
quark_4way_ctx_holder ctx;
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const uint32_t mask = 8;
const __m256i zero = _mm256_setzero_si256();
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
@@ -60,8 +61,7 @@ void quark_4way_hash( void *state, const void *input )
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -115,8 +115,7 @@ void quark_4way_hash( void *state, const void *input )
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
if ( mm256_anybits1( vh_mask ) )
{
@@ -141,8 +140,7 @@ void quark_4way_hash( void *state, const void *input )
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
if ( mm256_anybits1( vh_mask ) )
{
@@ -170,6 +168,7 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t *pdata = work->data;
@@ -179,7 +178,9 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
@@ -191,7 +192,7 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
for ( int i = 0; i < 4; i++ )
if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
{
mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
mm256_extr_lane_4x64( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
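/* A scalar sketch of the cheap pre-filter above, assuming 4x64-interleaved
   output where 64-bit word j of lane i occupies 32-bit indices 8*j + 2*i
   and 8*j + 2*i + 1: the most significant hash word (word 7) of lane i is
   then hash[ 25 + 2*i ], i.e. hash7[ i<<1 ] with hash7 = &hash[25], and
   only lanes whose top 24 bits are zero go on to the full target test. */
#include <stdint.h>

static inline int lane_prefilter_model( const uint32_t *hash, int i )
{
   const uint32_t *hash7 = &hash[25];
   return ( hash7[ i << 1 ] & 0xFFFFFF00 ) == 0;
}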

View File

@@ -118,7 +118,7 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
for ( int i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
{
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hash, i, 256 );
mm256_extr_lane_8x32( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[27] = n + i;

View File

@@ -86,8 +86,7 @@ static const sph_u32 K256[64] = {
// SHA-256 4 way
#define SHA2s_MEXP( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
SSG2_1( W[a] ), W[b] ), SSG2_0( W[c] ) ), W[d] );
mm128_add4_32( SSG2_1( W[a] ), W[b], SSG2_0( W[c] ), W[d] );
#define CHs(X, Y, Z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
@@ -115,9 +114,8 @@ static const sph_u32 K256[64] = {
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
register __m128i T1, T2; \
T1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( H, BSG2_1(E) ), CHs(E, F, G) ), \
_mm_set1_epi32( K256[( (j)+(i) )] ) ), W[i] ); \
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
_mm_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
@@ -129,22 +127,8 @@ sha256_4way_round( __m128i *in, __m128i r[8] )
register __m128i A, B, C, D, E, F, G, H;
__m128i W[16];
W[ 0] = mm128_bswap_32( in[ 0] );
W[ 1] = mm128_bswap_32( in[ 1] );
W[ 2] = mm128_bswap_32( in[ 2] );
W[ 3] = mm128_bswap_32( in[ 3] );
W[ 4] = mm128_bswap_32( in[ 4] );
W[ 5] = mm128_bswap_32( in[ 5] );
W[ 6] = mm128_bswap_32( in[ 6] );
W[ 7] = mm128_bswap_32( in[ 7] );
W[ 8] = mm128_bswap_32( in[ 8] );
W[ 9] = mm128_bswap_32( in[ 9] );
W[10] = mm128_bswap_32( in[10] );
W[11] = mm128_bswap_32( in[11] );
W[12] = mm128_bswap_32( in[12] );
W[13] = mm128_bswap_32( in[13] );
W[14] = mm128_bswap_32( in[14] );
W[15] = mm128_bswap_32( in[15] );
mm128_block_bswap_32( W, in );
mm128_block_bswap_32( W+8, in+8 );
A = r[0];
B = r[1];
@@ -266,7 +250,7 @@ void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
void sha256_4way_close( sha256_4way_context *sc, void *dst )
{
unsigned ptr, u;
unsigned ptr;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
@@ -294,8 +278,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
mm128_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc->buf, sc->val );
for ( u = 0; u < 8; u ++ )
((__m128i*)dst)[u] = mm128_bswap_32( sc->val[u] );
mm128_block_bswap_32( dst, sc->val );
}
#if defined(__AVX2__)
@@ -326,15 +309,13 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) )
#define SHA2x_MEXP( a, b, c, d ) \
_mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
SSG2_1x( W[a] ), W[b] ), SSG2_0x( W[c] ) ), W[d] );
mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] );
#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
register __m256i T1, T2; \
T1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
_mm256_add_epi32( H, BSG2_1x(E) ), CHx(E, F, G) ), \
_mm256_set1_epi32( K256[( (j)+(i) )] ) ), W[i] ); \
T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \
T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
@@ -346,22 +327,8 @@ sha256_8way_round( __m256i *in, __m256i r[8] )
register __m256i A, B, C, D, E, F, G, H;
__m256i W[16];
W[ 0] = mm256_bswap_32( in[ 0] );
W[ 1] = mm256_bswap_32( in[ 1] );
W[ 2] = mm256_bswap_32( in[ 2] );
W[ 3] = mm256_bswap_32( in[ 3] );
W[ 4] = mm256_bswap_32( in[ 4] );
W[ 5] = mm256_bswap_32( in[ 5] );
W[ 6] = mm256_bswap_32( in[ 6] );
W[ 7] = mm256_bswap_32( in[ 7] );
W[ 8] = mm256_bswap_32( in[ 8] );
W[ 9] = mm256_bswap_32( in[ 9] );
W[10] = mm256_bswap_32( in[10] );
W[11] = mm256_bswap_32( in[11] );
W[12] = mm256_bswap_32( in[12] );
W[13] = mm256_bswap_32( in[13] );
W[14] = mm256_bswap_32( in[14] );
W[15] = mm256_bswap_32( in[15] );
mm256_block_bswap_32( W , in );
mm256_block_bswap_32( W+8, in+8 );
A = r[0];
B = r[1];
@@ -484,7 +451,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
void sha256_8way_close( sha256_8way_context *sc, void *dst )
{
unsigned ptr, u;
unsigned ptr;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
@@ -513,8 +480,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
sha256_8way_round( sc->buf, sc->val );
for ( u = 0; u < 8; u ++ )
((__m256i*)dst)[u] = mm256_bswap_32( sc->val[u] );
mm256_block_bswap_32( dst, sc->val );
}
@@ -596,9 +562,8 @@ static const sph_u64 K512[80] = {
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
register __m256i T1, T2; \
T1 = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( \
_mm256_add_epi64( H, BSG5_1(E) ), CH(E, F, G) ), \
_mm256_set1_epi64x( K512[i] ) ), W[i] ); \
T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
_mm256_set1_epi64x( K512[i] ), W[i] ) ); \
T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
@@ -611,11 +576,12 @@ sha512_4way_round( __m256i *in, __m256i r[8] )
register __m256i A, B, C, D, E, F, G, H;
__m256i W[80];
for ( i = 0; i < 16; i++ )
W[i] = mm256_bswap_64( in[i] );
mm256_block_bswap_64( W , in );
mm256_block_bswap_64( W+8, in+8 );
for ( i = 16; i < 80; i++ )
W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64(
SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] );
W[i] = mm256_add4_64( SSG5_1( W[ i- 2 ] ), W[ i- 7 ],
SSG5_0( W[ i-15 ] ), W[ i-16 ] );
A = r[0];
B = r[1];
@@ -689,7 +655,7 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
void sha512_4way_close( sha512_4way_context *sc, void *dst )
{
unsigned ptr, u;
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
@@ -711,8 +677,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
sha512_4way_round( sc->buf, sc->val );
for ( u = 0; u < 8; u ++ )
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
mm256_block_bswap_64( dst, sc->val );
}
#endif // __AVX2__
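/* Minimal sketches of the two helpers adopted throughout this file
   (assumptions; the real definitions live in the tree's SIMD utility
   headers, and the 256-bit and 64-bit wide variants follow the same shape):
   mm128_add4_32 sums four vectors of packed 32-bit ints in one call, and
   mm128_block_bswap_32 byte swaps eight consecutive 128-bit vectors. */
#include <immintrin.h>

static inline __m128i mm128_add4_32_sketch( __m128i a, __m128i b,
                                            __m128i c, __m128i d )
{
   return _mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) );
}

static inline void mm128_block_bswap_32_sketch( __m128i *d, const __m128i *s )
{
   /* per-dword byte reversal via PSHUFB, requires SSSE3 */
   const __m128i bswap32 =
           _mm_set_epi8( 12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3 );
   for ( int i = 0; i < 8; i++ )
      d[i] = _mm_shuffle_epi8( s[i], bswap32 );
}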

View File

@@ -36,6 +36,8 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -58,7 +60,10 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
0 };
// Need big endian data
mm256_bswap_intrlv80_8x32( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
@@ -79,8 +84,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
if ( !( hash7[ lane ] & mask ) )
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
@@ -129,8 +133,9 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -152,7 +157,9 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
0xFFFF0000,
0 };
mm128_bswap_intrlv80_4x32( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm128_intrlv_4x32( vdata, edata, edata, edata, edata, 640 );
// mm128_bswap_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
@@ -168,7 +175,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{

View File

@@ -99,7 +99,7 @@ int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
{
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
mm256_extr_lane_8x32( lane_hash, hashx, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + i;
@@ -111,7 +111,7 @@ int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
{
mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
mm64_extr_lane_2x32( lane_hash, hashy, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + 8 + i;
@@ -163,6 +163,7 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
@@ -186,8 +187,12 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
0xFFFF0000,
0 };
swab32_array( edata, pdata, 20 );
mm256_intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
// Need big endian data
mm256_bswap_intrlv80_8x32( vdata, pdata );
// mm256_bswap_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
@@ -204,7 +209,7 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
if ( !( hash7[ lane ] & mask ) )
{
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
@@ -248,6 +253,7 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
@@ -271,7 +277,10 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
0xFFFF0000,
0 };
mm128_bswap_intrlv80_4x32( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm128_intrlv_4x32( vdata, edata, edata, edata, edata, 640 );
// mm128_bswap_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
@@ -287,7 +296,7 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -20,6 +20,7 @@ static const uint32_t IV512[] =
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
const __m128i zero = _mm_setzero_si128();
__m256i p0, p1, p2, p3, x;
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg;
@@ -33,24 +34,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round
k00 = m[0];
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
k01 = m[1];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = m[2];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = m[3];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p0 = _mm256_xor_si256( p0, x );
k10 = m[4];
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
k11 = m[5];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = m[6];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = m[7];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p2 = _mm256_xor_si256( p2, x );
@@ -59,129 +60,129 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
mm256_aesenc_2x128( k00 ) ) );
mm256_aesenc_2x128( k00, zero ) ) );
if ( r == 0 )
k00 = _mm256_xor_si256( k00, _mm256_set_epi32(
~ctx->count3, ctx->count2, ctx->count1, ctx->count0,
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( k00,
mm256_ror1x32_128( mm256_aesenc_2x128( k01 ) ) );
mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) );
if ( r == 1 )
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
~ctx->count0, ctx->count1, ctx->count2, ctx->count3,
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k01,
mm256_ror1x32_128( mm256_aesenc_2x128( k02 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k02,
mm256_ror1x32_128( mm256_aesenc_2x128( k03 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( k03,
mm256_ror1x32_128( mm256_aesenc_2x128( k10 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ) );
mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( k10,
mm256_ror1x32_128( mm256_aesenc_2x128( k11 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k11,
mm256_ror1x32_128( mm256_aesenc_2x128( k12 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k12,
mm256_ror1x32_128( mm256_aesenc_2x128( k13 ) ) );
mm256_ror1x32_128( mm256_aesenc_2x128( k13, zero ) ) );
if ( r == 2 )
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
~ctx->count1, ctx->count0, ctx->count3, ctx->count2,
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p1 = _mm256_xor_si256( p1, x );
// round 2, 6, 10
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p2 = _mm256_xor_si256( p2, x );
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p0 = _mm256_xor_si256( p0, x );
// round 3, 7, 11
k00 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k00 ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ) );
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k01 ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k02 ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k03 ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p1 = _mm256_xor_si256( p1, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k10 ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ) );
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k11 ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k12 ) ), k11 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
mm256_aesenc_2x128( k12, zero ) ), k11 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k13 ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p3 = _mm256_xor_si256( p3, x );
// round 4, 8, 12
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p0 = _mm256_xor_si256( p0, x );
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p2 = _mm256_xor_si256( p2, x );
@@ -190,36 +191,36 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 13
k00 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k00 ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ) );
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k01 ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k02 ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k03 ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k10 ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ) );
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k11 ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12 ) );
k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k13 ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p1 = _mm256_xor_si256( p1, x );
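/* A sketch of the reworked 2x128 AES helper as it is called above, with the
   zero round key passed explicitly; the assumption (consistent with the
   scalar c512() further below, which calls _mm_aesenc_si128( x, zero )) is
   one unkeyed AES round applied to each 128-bit half. */
#include <immintrin.h>
#include <wmmintrin.h>

static inline __m256i aesenc_2x128_sketch( __m256i x, __m128i zero )
{
   __m128i lo = _mm_aesenc_si128( _mm256_castsi256_si128( x ), zero );
   __m128i hi = _mm_aesenc_si128( _mm256_extracti128_si256( x, 1 ), zero );
   return _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
}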

View File

@@ -87,6 +87,7 @@ static const sph_u32 IV512[] = {
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
const __m128i zero = _mm_setzero_si128();
__m128i p0, p1, p2, p3, x;
__m128i k00, k01, k02, k03, k10, k11, k12, k13;
__m128i *m = (__m128i*)msg;
@@ -101,38 +102,38 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k02 = m[2];
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k03 = m[3];
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p0 = _mm_xor_si128( p0, x );
k10 = m[4];
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k11 = m[5];
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k12 = m[6];
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k13 = m[7];
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p2 = _mm_xor_si128( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 )
@@ -140,8 +141,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 )
@@ -149,32 +150,32 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 )
@@ -182,78 +183,78 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p1 = _mm_xor_si128( p1, x );
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p3 = _mm_xor_si128( p3, x );
@@ -261,73 +262,73 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p2 = _mm_xor_si128( p2, x );
}
// round 13
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
x = _mm_aesenc_si128( x, zero );
p1 = _mm_xor_si128( p1, x );
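/* The zero key threaded through every _mm_aesenc_si128() call above turns
   each step into a plain unkeyed AES round: ShiftRows, SubBytes and
   MixColumns with nothing XORed in afterwards. */
#include <wmmintrin.h>

static inline __m128i aes_round_nokey( __m128i x )
{
   return _mm_aesenc_si128( x, _mm_setzero_si128() );
}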

View File

@@ -342,6 +342,7 @@ void fft128_2way( void *a )
void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
const __m256i zero = _mm256_setzero_si256();
static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};
@@ -352,10 +353,10 @@ void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
#define UNPACK( i ) \
do { \
__m256i t = X[i]; \
A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[2*i] = _mm256_unpacklo_epi8( t, zero ); \
A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].v256 ); \
A[2*i+8] = REDUCE(A[2*i+8]); \
A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
A[2*i+1] = _mm256_unpackhi_epi8( t, zero ); \
A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].v256 ); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
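/* A sketch of what the UNPACK macro above relies on: interleaving message
   bytes with a zero vector zero-extends them to 16-bit lanes, so A[2*i]
   holds the low eight bytes of each 128-bit half widened to uint16. */
#include <immintrin.h>

static inline __m256i widen_lo_bytes_sketch( __m256i t, __m256i zero )
{
   return _mm256_unpacklo_epi8( t, zero );   /* 0x00 above every byte */
}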
@@ -365,10 +366,10 @@ do { \
do { \
__m256i t = X[i]; \
__m256i tmp; \
A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[2*i] = _mm256_unpacklo_epi8( t, zero ); \
A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].v256 ); \
A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
tmp = _mm256_unpackhi_epi8( t, m256_zero ); \
tmp = _mm256_unpackhi_epi8( t, zero ); \
A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
FFT128_Twiddle[ 2*i+1 ].v256 );\
@@ -392,6 +393,7 @@ do { \
void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
const __m256i zero = _mm256_setzero_si256();
static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};
@@ -402,11 +404,11 @@ void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
#define UNPACK( i ) \
do { \
__m256i t = X[i]; \
A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[ 2*i ] = _mm256_unpacklo_epi8( t, zero ); \
A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
FFT256_Twiddle[ 2*i ].v256 ); \
A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
A[ 2*i + 1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
A[ 2*i + 1 ] = _mm256_unpackhi_epi8( t, zero ); \
A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
FFT256_Twiddle[ 2*i + 1 ].v256 ); \
A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
@@ -417,11 +419,11 @@ do { \
do { \
__m256i t = X[i]; \
__m256i tmp; \
A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[ 2*i ] = _mm256_unpacklo_epi8( t, zero ); \
A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
FFT256_Twiddle[ 2*i ].v256 ); \
A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
tmp = _mm256_unpackhi_epi8( t, m256_zero ); \
tmp = _mm256_unpackhi_epi8( t, zero ); \
A[ 2*i + 1 ] = _mm256_add_epi16( tmp, tw ); \
A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
FFT256_Twiddle[ 2*i + 1 ].v256 ); \
@@ -446,6 +448,8 @@ do { \
fft128_2way( a+256 );
}
#define c1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
{
register __m256i S0l, S1l, S2l, S3l;
@@ -453,7 +457,8 @@ void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
__m256i *S = (__m256i*) state;
__m256i *M = (__m256i*) msg;
__m256i *W = (__m256i*) fft;
static const m256_v16 code[] = { mm256_const1_16(185), mm256_const1_16(233) };
static const m256_v16 code[] = { c1_16(185), c1_16(233) };
S0l = _mm256_xor_si256( S[0], M[0] );
S0h = _mm256_xor_si256( S[1], M[1] );

View File

@@ -48,7 +48,7 @@ void skeinhash_4way( void *state, const void *input )
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
mm128_intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
#else
mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );
@@ -63,7 +63,8 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8];
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -73,7 +74,9 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
@@ -84,7 +87,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;

View File

@@ -23,8 +23,10 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -32,10 +34,20 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
// uint32_t *noncep = vdata + 73; // 9*8 + 1
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
// be32enc( noncep, n );
// be32enc( noncep+2, n+1 );
// be32enc( noncep+4, n+2 );
// be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
@@ -44,8 +56,7 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
uint32_t lane_hash[8];
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -165,24 +165,22 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
@@ -190,10 +188,8 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
c11_4way_hash( hash, vdata );
pdata[19] = n;
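/* A scalar sketch of the vectored nonce update that replaces the four
   be32enc() calls above. In the 4x64-interleaved header, lane l's nonce
   (word 19) sits at 32-bit index 73 + 2*l, exactly where the removed
   noncep = vdata + 73 pointer wrote; assuming mm256_intrlv_blend_32 keeps
   the even 32-bit slots of *noncev and takes the odd slots from the new
   vector, the blend stores the same big-endian values in one shot. */
#include <stdint.h>

static inline uint32_t bswap32_model( uint32_t x )
{
   return ( x >> 24 ) | ( ( x >> 8 ) & 0x0000ff00 ) |
          ( ( x << 8 ) & 0x00ff0000 ) | ( x << 24 );
}

static void set_nonces_model( uint32_t *vdata, uint32_t n )
{
   for ( uint32_t l = 0; l < 4; l++ )
      vdata[ 73 + 2*l ] = bswap32_model( n + l );
}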

View File

@@ -64,13 +64,13 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(128) endiandata[20];
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0,
@@ -87,14 +87,9 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
0xFFFF0000,
0 };
// we need bigendian data...
for ( int i = 0; i < 20; i++ )
{
be32enc( &endiandata[i], pdata[i] );
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
// precalc midstate
// doing it one way then interleaving would be faster but too
@@ -108,10 +103,8 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
{
uint32_t mask = masks[m];
do {
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
tribus_hash_4way( hash, vdata );

View File

@@ -164,24 +164,22 @@ int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
@@ -189,10 +187,8 @@ int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x11_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -171,24 +171,22 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
@@ -196,10 +194,8 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x11gost_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -114,6 +114,7 @@ int scanhash_phi1612_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -124,7 +125,9 @@ int scanhash_phi1612_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0cff;
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(

View File

@@ -78,29 +78,26 @@ int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
((uint32_t*)ptarget)[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
skunk_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -189,12 +189,12 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -202,11 +202,9 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
@@ -214,10 +212,8 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x13_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -166,7 +166,7 @@ void x13sm3_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
// SM3 parallel 32 bit
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
@@ -182,7 +182,7 @@ void x13sm3_4way_hash( void *state, const void *input )
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
// Hamsi parallel 4x32x2
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
@@ -214,12 +214,12 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -227,11 +227,9 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
blake512_4way_init( &x13sm3_ctx_mid );
blake512_4way( &x13sm3_ctx_mid, vdata, 64 );
@@ -242,10 +240,8 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x13sm3_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -43,7 +43,7 @@ void polytimos_4way_hash( void *output, const void *input )
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash32, 64 );
shabal512_4way_close( &ctx.shabal, vhash32 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
@@ -105,6 +105,7 @@ int scanhash_polytimos_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -117,7 +118,9 @@ int scanhash_polytimos_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

View File

@@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
@@ -82,31 +82,27 @@ int scanhash_veltor_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int i=0; i < 19; i++ )
{
be32enc( &endiandata[i], pdata[i] );
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
veltor_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -183,10 +183,9 @@ void x14_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, state );
}
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
@@ -194,12 +193,12 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -207,11 +206,9 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
@@ -219,10 +216,8 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x14_4way_hash( hash, vdata );
pdata[19] = n;
@@ -234,7 +229,7 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{

View File

@@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -217,13 +217,13 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -231,11 +231,10 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
@@ -243,10 +242,8 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x15_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -248,11 +248,11 @@ void x16r_4way_hash( void* output, const void* input )
sph_fugue512_close( &ctx.fugue, hash3 );
break;
case SHABAL:
mm128_intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );

View File

@@ -390,7 +390,7 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
@@ -438,7 +438,7 @@ void sonoa_4way_hash( void *state, const void *input )
shabal512_4way( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -522,13 +522,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -635,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -769,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -807,9 +807,10 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -823,7 +824,9 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
@@ -837,7 +840,7 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -124,8 +124,8 @@ void x17_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128_512( hash0, hash1, vhashA );
mm256_dintrlv_2x128_512( hash2, hash3, vhashB );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
@@ -165,13 +165,13 @@ void x17_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 4 way 32 bit
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
@@ -206,9 +206,10 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -222,7 +223,9 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[ m ];
@@ -235,7 +238,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -161,13 +161,13 @@ void xevan_4way_hash( void *output, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// Parallel 4way 32 bit
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
// Serial
sph_whirlpool_init( &ctx.whirlpool );
@@ -295,13 +295,13 @@ void xevan_4way_hash( void *output, const void *input )
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
@@ -333,9 +333,10 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
@@ -348,7 +349,9 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm256_bswap_intrlv80_4x64( vdata, pdata );
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
// mm256_bswap_intrlv80_4x64( vdata, pdata );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), *noncev );
@@ -357,7 +360,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;

View File

@@ -18,7 +18,7 @@ rm -f config.status
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.5.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.5.3.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.5'
PACKAGE_STRING='cpuminer-opt 3.9.5'
PACKAGE_VERSION='3.9.5.3'
PACKAGE_STRING='cpuminer-opt 3.9.5.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.9.5.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.9.5.3:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.9.5
cpuminer-opt configure 3.9.5.3
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.9.5, which was
It was created by cpuminer-opt $as_me 3.9.5.3, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.9.5'
VERSION='3.9.5.3'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.9.5, which was
This file was extended by cpuminer-opt $as_me 3.9.5.3, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.9.5
cpuminer-opt config.status 3.9.5.3
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.5])
AC_INIT([cpuminer-opt], [3.9.5.3])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -178,7 +178,7 @@ static char const short_options[] =
#endif
"a:b:Bc:CDf:hm:n:p:Px:qr:R:s:t:T:o:u:O:V";
static struct work g_work = {{ 0 }};
static struct work g_work __attribute__ ((aligned (64))) = {{ 0 }};
//static struct work tmp_work;
time_t g_work_time = 0;
static pthread_mutex_t g_work_lock;
@@ -843,92 +843,160 @@ void scale_hash_for_display ( double* hashrate, char* units )
const uint64_t diff2hash = 0x40000000ULL;
static struct timeval submit_time, prev_submit_time;
static struct timeval submit_interval;
static struct timeval five_min_start;
static double shash_sum = 0.;
static double bhash_sum = 0.;
static double time_sum = 0.;
static double latency_sum = 0.;
static uint64_t submits_sum = 0;
static struct timeval five_min_start;
static double shash_sum = 0.;
static double bhash_sum = 0.;
static double time_sum = 0.;
static double latency_sum = 0.;
static uint64_t submit_sum = 0;
static uint64_t reject_sum = 0;
static int share_result( int result, struct work *work, const char *reason )
struct share_stats_t
{
char hr[16];
const char *sres;
double hashcount = 0.;
double hashrate = 0.;
struct timeval submit_time;
double net_diff;
double share_diff;
char job_id[32];
};
// with more and more parallelism, the chances of submitting multiple
// shares in a very short time grow.
#define s_stats_size 8
static struct share_stats_t share_stats[ s_stats_size ];
static int s_get_ptr = 0, s_put_ptr = 0;
static struct timeval last_submit_time = {0};
static inline int stats_ptr_incr( int p )
{
return ++p < s_stats_size ? p : 0;
}
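// A minimal, self-contained sketch of this kind of fixed size ring buffer
// (hypothetical names, not the miner's code): the put side only fills a slot
// that is empty, a full buffer is detected by the next slot still being
// occupied, and the get side leaves its pointer alone when a slot is empty.

#include <stdio.h>

#define RING_SIZE 8                      /* mirrors s_stats_size */
static int ring[RING_SIZE];              /* 0 means the slot is empty */
static int get_ptr = 0, put_ptr = 0;

static inline int ptr_incr( int p ) { return ++p < RING_SIZE ? p : 0; }

static int ring_put( int v )             /* returns 0 on overflow (discard) */
{
   if ( ring[ put_ptr ] != 0 ) return 0; /* slot still pending: buffer full */
   ring[ put_ptr ] = v;
   put_ptr = ptr_incr( put_ptr );
   return 1;
}

static int ring_get( void )              /* returns 0 if nothing is pending */
{
   int v = ring[ get_ptr ];
   if ( v ) { ring[ get_ptr ] = 0; get_ptr = ptr_incr( get_ptr ); }
   return v;
}

int main(void)
{
   for ( int i = 1; i <= 10; i++ )       /* the 9th and 10th puts are dropped */
      printf( "put %d -> %s\n", i, ring_put( i ) ? "ok" : "discarded" );
   for ( int v; ( v = ring_get() ); )
      printf( "got %d\n", v );
   return 0;
}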
static int share_result( int result, struct work *null_work,
const char *reason )
{
double share_time = 0., share_hash = 0., block_hash = 0., share_size = 0.;
double hashcount = 0., hashrate = 0.;
uint64_t latency = 0;
struct share_stats_t my_stats = {0};
struct timeval ack_time, latency_tv, et;
char hr[32];
char hr_units[4] = {0};
bool solved;
char shr[16];
char shr[32];
char shr_units[4] = {0};
char diffstr[32];
struct timeval ack_time, latency_tv;
uint64_t latency;
double share_time, share_hash, block_hash;
double sharediff = work ? work->sharediff : stratum.sharediff;
double share_size;
const char *sres = NULL;
bool solved = false;
// Mutex while we grab a snapshot of the global counters.
pthread_mutex_lock( &stats_lock );
// When submit_work detects a buffer overflow it discards the stats for
// the new share. When we catch up we may get acks for shares with
// no stats. Leaving the get pointer un-incremented will resync with the
// put pointer.
if ( share_stats[ s_get_ptr ].submit_time.tv_sec )
{
memcpy( &my_stats, &share_stats[ s_get_ptr], sizeof my_stats );
memset( &share_stats[ s_get_ptr ], 0, sizeof my_stats );
s_get_ptr = stats_ptr_incr( s_get_ptr );
pthread_mutex_unlock( &stats_lock );
}
else
{
pthread_mutex_unlock( &stats_lock );
applog(LOG_WARNING,"Pending shares overflow, stats for share are lost.");
}
for ( int i = 0; i < opt_n_threads; i++ )
{
hashcount += thr_hashcount[i];
hashrate += thr_hashrates[i];
}
global_hashcount = hashcount;
global_hashrate = hashrate;
// calculate latency
gettimeofday( &ack_time, NULL );
timeval_subtract( &latency_tv, &ack_time, &submit_time );
latency = ( latency_tv.tv_sec * 1000 + latency_tv.tv_usec / 1000 );
// calculate latency and share time.
if ( my_stats.submit_time.tv_sec )
{
gettimeofday( &ack_time, NULL );
timeval_subtract( &latency_tv, &ack_time, &my_stats.submit_time );
latency = ( latency_tv.tv_sec * 1000 + latency_tv.tv_usec / 1000 );
timeval_subtract( &et, &my_stats.submit_time, &last_submit_time );
share_time = (double)et.tv_sec + ( (double)et.tv_usec / 1000000. );
memcpy( &last_submit_time, &my_stats.submit_time,
sizeof last_submit_time );
}
// calculate share hashrate and size
share_time = submit_interval.tv_sec + ( submit_interval.tv_usec / 1000000. );
share_hash = sharediff * diff2hash;
block_hash = net_diff * diff2hash;
share_size = block_hash == 0. ? 0. : share_hash / block_hash;
share_hash = my_stats.share_diff * diff2hash;
block_hash = my_stats.net_diff * diff2hash;
share_size = block_hash == 0. ? 0. : share_hash / block_hash * 100.;
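// Worked example with hypothetical numbers: share_diff 0.5 and net_diff 2000
// give share_hash = 0.5 * 2^30 and block_hash = 2000 * 2^30, so
// share_size = 0.5 / 2000 * 100 = 0.025% of the work needed for a block.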
// check result
result ? accepted_share_count++ : rejected_share_count++;
solved = result && (my_stats.net_diff > 0.0 )
&& ( my_stats.share_diff >= net_diff );
solved_block_count += solved ? 1 : 0 ;
// update counters for 5 minute summary report
pthread_mutex_lock( &stats_lock );
shash_sum += share_hash;
bhash_sum += block_hash;
time_sum += share_time;
submits_sum ++;
submit_sum ++;
reject_sum += (uint64_t)!result;
latency_sum += latency;
pthread_mutex_unlock( &stats_lock );
double share_hash_rate = share_time == 0. ? 0. : share_hash / share_time;
double scaled_shr;
result ? accepted_share_count++ : rejected_share_count++;
global_hashcount = hashcount;
global_hashrate = hashrate;
// check for solved block
solved = result && (net_diff > 0.0 ) && ( sharediff >= net_diff );
solved_block_count += solved ? 1 : 0 ;
scaled_shr = share_hash_rate;
scale_hash_for_display ( &scaled_shr, shr_units );
if ( use_colors )
{
sres = ( solved ? ( CL_MAG "BLOCK SOLVED" CL_WHT )
: result ? ( CL_GRN "Accepted" CL_WHT )
: ( CL_RED "Rejected" CL_WHT ) );
: result ? ( CL_GRN "Accepted" CL_WHT )
: ( CL_RED "Rejected" CL_WHT ) );
// colour code the share diff to highlight high value.
if ( solved )
sprintf( diffstr, "%s%.3g%s", CL_MAG, sharediff, CL_WHT );
else if ( share_size > 0.01 )
sprintf( diffstr, "%s%.3g%s", CL_GRN, sharediff, CL_WHT );
else if ( share_size > 0.001 )
sprintf( diffstr, "%s%.3g%s", CL_CYN, sharediff, CL_WHT );
else if ( share_hash_rate > hashrate )
sprintf( diffstr, "%s%.3g%s", CL_YLW, sharediff, CL_WHT );
sprintf( diffstr, "%s%.3g%s", CL_MAG, my_stats.share_diff, CL_WHT );
else if ( my_stats.share_diff > ( my_stats.net_diff * 0.1 ) )
sprintf( diffstr, "%s%.3g%s", CL_GRN, my_stats.share_diff, CL_WHT );
else if ( my_stats.share_diff > ( my_stats.net_diff * 0.01 ) )
sprintf( diffstr, "%s%.3g%s", CL_CYN, my_stats.share_diff, CL_WHT );
else
sprintf( diffstr, "%.3g", sharediff );
sprintf( diffstr, "%.3g", my_stats.share_diff );
if ( hashrate ) // don't colour share hash rate without reference rate.
{
if ( share_hash_rate > 768. * hashrate )
sprintf( shr, "%s%.2f %sH/s%s", CL_MAG, scaled_shr, shr_units,
CL_WHT );
else if ( share_hash_rate > 32. * hashrate )
sprintf( shr, "%s%.2f %sH/s%s", CL_GRN, scaled_shr, shr_units,
CL_WHT );
else if ( share_hash_rate > 2.0 * hashrate )
sprintf( shr, "%s%.2f %sH/s%s", CL_CYN, scaled_shr, shr_units,
CL_WHT );
else if ( share_hash_rate > 0.5 * hashrate )
sprintf( shr, "%.2f %sH/s", scaled_shr, shr_units );
else
sprintf( shr, "%s%.2f %sH/s%s", CL_YLW, scaled_shr, shr_units,
CL_WHT );
}
else
sprintf( shr, "%.2f %sH/s", scaled_shr, shr_units );
}
else
else // monochrome
{
sres = ( solved ? "BLOCK SOLVED"
: result ? "Accepted" : "Rejected" );
sprintf( diffstr, "%3g", sharediff );
sres = ( solved ? "BLOCK SOLVED" : result ? "Accepted" : "Rejected" );
sprintf( diffstr, "%.3g", my_stats.share_diff );
sprintf( shr, "%.2f %sH/s", scaled_shr, shr_units );
}
scale_hash_for_display ( &hashrate, hr_units );
@@ -941,36 +1009,20 @@ static int share_result( int result, struct work *work, const char *reason )
sres, diffstr, share_time, accepted_share_count,
rejected_share_count, solved_block_count );
if ( have_stratum && result && sharediff && net_diff && !opt_quiet )
if ( have_stratum && result && my_stats.share_diff && my_stats.net_diff
&& !opt_quiet )
{
// double share_hash_rate = share_time == 0. ? 0. : share_hash / share_time;
scale_hash_for_display ( &share_hash_rate, shr_units );
if ( share_hash_rate < 10 )
// very low hashrate, add digits
sprintf( shr, "%.4f", share_hash_rate );
else
sprintf( shr, "%.2f", share_hash_rate );
applog( LOG_NOTICE, "Miner %s %sH/s, Share %s %sH/s, Latency %d ms.",
hr, hr_units, shr, shr_units, latency );
applog( LOG_NOTICE, "Height %d, Block share %.5f%%.",
stratum.bloc_height, share_size*100. );
applog( LOG_NOTICE, "Miner %s %sH/s, Share %s, Latency %d ms.",
hr, hr_units, shr, latency );
applog( LOG_NOTICE, "Height %d, job %s, %.5f%% block share.",
stratum.bloc_height, my_stats.job_id, share_size );
applog(LOG_INFO,"- - - - - - - - - - - - - - - - - - - - - - - - - - -");
}
if ( reason )
{
applog( LOG_WARNING, "reject reason: %s", reason );
/*
if (strncmp(reason, "low difficulty share", 20) == 0)
{
opt_diff_factor = (opt_diff_factor * 2.0) / 3.0;
applog(LOG_WARNING, "factor reduced to : %0.2f", opt_diff_factor);
return 0;
}
*/
}
return 1;
applog( LOG_WARNING, "reject reason: %s.", reason );
return 1;
}
void std_le_build_stratum_request( char *req, struct work *work )
@@ -1557,9 +1609,21 @@ bool submit_work(struct thr_info *thr, const struct work *work_in)
{
struct workio_cmd *wc;
memcpy( &prev_submit_time, &submit_time, sizeof submit_time );
gettimeofday( &submit_time, NULL );
timeval_subtract( &submit_interval, &submit_time, &prev_submit_time );
// collect some share stats
pthread_mutex_lock( &stats_lock );
// If the buffer is full, discard the stats and don't increment the pointer.
// We're on the clock so let share_result report it.
if ( share_stats[ s_put_ptr ].submit_time.tv_sec == 0 )
{
gettimeofday( &share_stats[ s_put_ptr ].submit_time, NULL );
share_stats[ s_put_ptr ].share_diff = work_in->sharediff;
share_stats[ s_put_ptr ].net_diff = net_diff;
strcpy( share_stats[ s_put_ptr ].job_id, work_in->job_id );
s_put_ptr = stats_ptr_incr( s_put_ptr );
}
pthread_mutex_unlock( &stats_lock );
/* fill out work request message */
wc = (struct workio_cmd *) calloc(1, sizeof(*wc));
@@ -1723,6 +1787,7 @@ uint32_t* jr2_get_nonceptr( uint32_t *work_data )
return (uint32_t*) ( ((uint8_t*) work_data) + algo_gate.nonce_index );
}
void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr, bool clean_job )
{
@@ -1735,10 +1800,11 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
// or
// || ( !benchmark && strcmp( work->job_id, g_work->job_id ) ) ) )
// For now leave it as is, it seems stable.
// strtoul seems to work.
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) ) )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| strtoul( work->job_id, NULL, 16 )
!= strtoul( g_work->job_id, NULL, 16 ) ) )
{
work_free( work );
work_copy( work, g_work );
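// One reading of the strtoul comparison above, with made-up job ids: hex ids
// that differ only in case or leading zeros parse to the same value, so only
// a genuinely different job id forces the work to be refreshed.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
   const char *a = "01af", *b = "1AF";          /* hypothetical job ids */
   printf( "strcmp equal:  %d\n", strcmp( a, b ) == 0 );             /* 0 */
   printf( "strtoul equal: %d\n",
           strtoul( a, NULL, 16 ) == strtoul( b, NULL, 16 ) );       /* 1 */
   return 0;
}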
@@ -1786,9 +1852,9 @@ bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
static void *miner_thread( void *userdata )
{
struct work work __attribute__ ((aligned (64))) ;
struct thr_info *mythr = (struct thr_info *) userdata;
int thr_id = mythr->id;
struct work work;
uint32_t max_nonce;
struct timeval et;
struct timeval time_now;
@@ -1912,7 +1978,7 @@ static void *miner_thread( void *userdata )
if ( have_stratum )
{
algo_gate.wait_for_diff( &stratum );
pthread_mutex_lock( &g_work_lock );
pthread_mutex_lock( &g_work_lock );
if ( *algo_gate.get_nonceptr( work.data ) >= end_nonce )
algo_gate.stratum_gen_work( &stratum, &g_work );
algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce,
@@ -1922,20 +1988,20 @@ static void *miner_thread( void *userdata )
else
{
int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
pthread_mutex_lock( &g_work_lock );
pthread_mutex_lock( &g_work_lock );
if ( time(NULL) - g_work_time >= min_scantime
|| *algo_gate.get_nonceptr( work.data ) >= end_nonce )
{
if ( unlikely( !get_work( mythr, &g_work ) ) )
if ( unlikely( !get_work( mythr, &g_work ) ) )
{
applog( LOG_ERR, "work retrieval failed, exiting "
"mining thread %d", thr_id );
applog( LOG_ERR, "work retrieval failed, exiting "
"mining thread %d", thr_id );
pthread_mutex_unlock( &g_work_lock );
goto out;
}
goto out;
}
g_work_time = time(NULL);
}
}
algo_gate.get_new_work( &work, &g_work, thr_id, &end_nonce, true );
pthread_mutex_unlock( &g_work_lock );
@@ -2023,7 +2089,9 @@ static void *miner_thread( void *userdata )
break;
}
if ( !opt_quiet )
applog( LOG_NOTICE, "Share submitted." );
applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.",
accepted_share_count + rejected_share_count + 1,
mythr->id, work.job_id );
// prevent stale work in solo
// we can't submit the same block twice!
@@ -2035,6 +2103,7 @@ static void *miner_thread( void *userdata )
pthread_mutex_unlock( &g_work_lock );
}
}
// Check for the 5 minute summary report; hold the mutex until the global
// counters are read and reset. It's bad form to unlock inside a conditional
// block but more efficient. The logic is reversed to make the mutex
@@ -2047,78 +2116,157 @@ static void *miner_thread( void *userdata )
pthread_mutex_unlock( &stats_lock );
else
{
// collect and reset counters
// collect and reset global counters
double hash = shash_sum; shash_sum = 0.;
double bhash = bhash_sum; bhash_sum = 0.;
double time = time_sum; time_sum = 0.;
uint64_t submits = submits_sum; submits_sum = 0;
uint64_t submits = submit_sum; submit_sum = 0;
uint64_t rejects = reject_sum; reject_sum = 0;
uint64_t latency = latency_sum; latency_sum = 0;
memcpy( &five_min_start, &time_now, sizeof time_now );
pthread_mutex_unlock( &stats_lock );
char hr[16];
char hr_units[4] = {0};
char bshstr[32];
double hrate = time == 0. ? 0. : hash / time;
double avg_share = bhash == 0. ? 0. : hash / bhash * 100.;
latency = submits ? latency / submits : 0;
double ghrate = global_hashrate;
double scaled_ghrate = ghrate;
double shrate = time == 0. ? 0. : hash / time;
double scaled_shrate = shrate;
double avg_share = bhash == 0. ? 0. : hash / bhash * 100.;
uint64_t avg_latency = 0;
double latency_pc = 0.;
double rejects_pc = 0.;
double submit_rate = 0.;
char shr[32];
char shr_units[4] = {0};
char ghr[32];
char ghr_units[4] = {0};
int temp = cpu_temp(0);
char tempstr[32];
// colour code the block share to highlight high value.
if ( avg_share > 90.0 )
sprintf( bshstr, "%s%.5f%s", CL_MAG, avg_share, CL_WHT );
else if ( avg_share > 1.0 )
sprintf( bshstr, "%s%.5f%s", CL_GRN, avg_share, CL_WHT );
else if ( avg_share > 0.1 )
sprintf( bshstr, "%s%.5f%s", CL_CYN, avg_share, CL_WHT );
else if ( hrate > global_hashrate )
sprintf( bshstr, "%s%.5f%s", CL_YLW, avg_share, CL_WHT );
else
sprintf( bshstr, "%.5f", avg_share );
scale_hash_for_display ( &hrate, hr_units );
if ( hrate < 10. )
// very low hashrate, add digits
sprintf( hr, "%.4f", hrate );
else
sprintf( hr, "%.2f", hrate );
if ( submits )
avg_latency = latency / submits;
applog(LOG_NOTICE,"Summary: %d submits in %dm%02ds, block share %s%%.",
(uint64_t)submits, et.tv_sec / 60,
et.tv_sec % 60, bshstr );
if ( time != 0. )
{
submit_rate = (double)submits*60. / time;
rejects_pc = (double)rejects / (time*10.);
latency_pc = (double)latency / ( time*10.);
}
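// Example with hypothetical numbers over a 300 s window: 30 submits with a
// summed latency of 3000 ms give submit_rate = 30*60/300 = 6.0 shares/min and
// latency_pc = 3000/(300*10) = 1.0%, i.e. latency as a share of elapsed time.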
scale_hash_for_display( &scaled_shrate, shr_units );
scale_hash_for_display( &scaled_ghrate, ghr_units );
sprintf( ghr, "%.2f %sH/s", scaled_ghrate, ghr_units );
if ( use_colors )
{
if ( shrate > (128.*ghrate) )
sprintf( shr, "%s%.2f %sH/s%s", CL_MAG, scaled_shrate,
shr_units, CL_WHT );
else if ( shrate > (16.*ghrate) )
sprintf( shr, "%s%.2f %sH/s%s", CL_GRN, scaled_shrate,
shr_units, CL_WHT );
else if ( shrate > 2.0*ghrate )
sprintf( shr, "%s%.2f %sH/s%s", CL_CYN, scaled_shrate,
shr_units, CL_WHT );
else if ( shrate > 0.5*ghrate )
sprintf( shr, "%.2f %sH/s", scaled_shrate, shr_units );
else
sprintf( shr, "%s%.2f %sH/s%s", CL_YLW, scaled_shrate,
shr_units, CL_WHT );
if ( temp >= 80 ) sprintf( tempstr, "%s%d C%s",
CL_RED, temp, CL_WHT );
else if (temp >=70 ) sprintf( tempstr, "%s%d C%s",
CL_YLW, temp, CL_WHT );
else sprintf( tempstr, "%d C", temp );
}
else
{
sprintf( shr, "%.2f %sH/s", scaled_shrate, shr_units );
sprintf( tempstr, "%d C", temp );
}
applog(LOG_NOTICE,"Submitted %d shares in %dm%02ds.",
(uint64_t)submits, et.tv_sec / 60, et.tv_sec % 60 );
applog(LOG_NOTICE,"%d rejects (%.2f%%), %.5f%% block share.",
rejects, rejects_pc, avg_share );
applog(LOG_NOTICE,"Avg hashrate: Miner %s, Share %s.", ghr, shr );
#if ((defined(_WIN64) || defined(__WINDOWS__)))
applog(LOG_NOTICE,"Shares/min: %.2f, latency %d ms (%.2f%%).",
submit_rate, avg_latency, latency_pc );
#else
applog(LOG_NOTICE,"Shares/min: %.2f, latency %d ms (%.2f%%), temp: %s.",
submit_rate, avg_latency, latency_pc, tempstr );
#endif
/*
applog(LOG_NOTICE,"Submitted %d shares in %dm%02ds, %.5f%% block share.",
(uint64_t)submits, et.tv_sec / 60, et.tv_sec % 60, avg_share );
#if ((defined(_WIN64) || defined(__WINDOWS__)))
applog(LOG_NOTICE,"Share hashrate %s %sH/s, latency %d ms.",
hr, hr_units, latency );
applog(LOG_NOTICE,"Share hashrate %s, latency %d ms (%.2f%%).",
shr, avg_latency, latency_pc );
#else
applog(LOG_NOTICE,"Share hashrate %s %sH/s, latency %d ms, temp %dC.",
hr, hr_units, latency, (uint32_t)cpu_temp(0) );
applog(LOG_NOTICE,"Share hashrate %s, latency %d ms (%.2f%%), temp %s.",
shr, avg_latency, latency_pc, tempstr );
#endif
*/
applog(LOG_INFO,"- - - - - - - - - - - - - - - - - - - - - - - - - - -");
}
// display hashrate
if ( opt_hash_meter )
if ( !opt_quiet )
{
char hc[16];
char hr[16];
char hc_units[2] = {0,0};
char hr_units[2] = {0,0};
double hashcount = thr_hashcount[thr_id];
double hashrate = thr_hashrates[thr_id];
if ( hashcount )
double hashcount;
double hashrate;
if ( opt_hash_meter )
{
scale_hash_for_display( &hashcount, hc_units );
scale_hash_for_display( &hashrate, hr_units );
if ( hc_units[0] )
sprintf( hc, "%.2f", hashcount );
else // no fractions of a hash
sprintf( hc, "%.0f", hashcount );
sprintf( hr, "%.2f", hashrate );
applog( LOG_INFO, "CPU #%d: %s %sH, %s %sH/s",
thr_id, hc, hc_units, hr, hr_units );
hashcount = thr_hashcount[thr_id];
hashrate = thr_hashrates[thr_id];
if ( hashcount != 0. )
{
scale_hash_for_display( &hashcount, hc_units );
scale_hash_for_display( &hashrate, hr_units );
if ( hc_units[0] )
sprintf( hc, "%.2f", hashcount );
else // no fractions of a hash
sprintf( hc, "%.0f", hashcount );
sprintf( hr, "%.2f", hashrate );
applog( LOG_INFO, "CPU #%d: %s %sH, %s %sH/s",
thr_id, hc, hc_units, hr, hr_units );
}
}
if ( thr_id == 0 )
{
hashcount = 0.;
hashrate = 0.;
for ( i = 0; i < opt_n_threads; i++ )
{
hashrate += thr_hashrates[i];
hashcount += thr_hashcount[i];
}
if ( hashcount != 0. )
{
scale_hash_for_display( &hashcount, hc_units );
scale_hash_for_display( &hashrate, hr_units );
if ( hc_units[0] )
sprintf( hc, "%.2f", hashcount );
else // no fractions of a hash
sprintf( hc, "%.0f", hashcount );
sprintf( hr, "%.2f", hashrate );
applog( LOG_NOTICE, "Miner perf: %s %sH, %s %sH/s.",
hc, hc_units, hr, hr_units );
}
}
}
// Display benchmark total
// Update hashrate for API if no shares accepted yet.
if ( ( opt_benchmark || !accepted_share_count )
@@ -2131,7 +2279,7 @@ static void *miner_thread( void *userdata )
hashrate += thr_hashrates[i];
hashcount += thr_hashcount[i];
}
if ( hashcount )
if ( hashcount != 0. )
{
global_hashcount = hashcount;
global_hashrate = hashrate;
@@ -2294,7 +2442,7 @@ start:
sprintf(netinfo, ", diff %.3f", net_diff);
}
if (opt_showdiff)
sprintf( &netinfo[strlen(netinfo)], ", target %.3f",
sprintf( &netinfo[strlen(netinfo)], ", target %.3f",
g_work.targetdiff );
applog(LOG_BLUE, "%s detected new block%s", short_url, netinfo);
}
@@ -2383,8 +2531,8 @@ static bool stratum_handle_response( char *buf )
val = JSON_LOADS( buf, &err );
if (!val)
{
applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text);
{
applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text);
goto out;
}
res_val = json_object_get( val, "result" );
@@ -2393,8 +2541,8 @@ static bool stratum_handle_response( char *buf )
id_val = json_object_get( val, "id" );
if ( !id_val || json_is_null(id_val) )
goto out;
if ( !algo_gate.stratum_handle_response( val ) )
goto out;
if ( !algo_gate.stratum_handle_response( val ) )
goto out;
ret = true;
out:
if (val)
@@ -2457,6 +2605,9 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
algo_gate.set_work_data_endian( g_work );
pthread_mutex_unlock( &sctx->work_lock );
// if ( !opt_quiet )
// applog( LOG_BLUE,"New job %s.", g_work->job_id );
if ( opt_debug )
{
unsigned char *xnonce2str = abin2hex( g_work->xnonce2,
@@ -2470,14 +2621,14 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
if ( stratum_diff != sctx->job.diff )
{
char sdiff[32] = { 0 };
// char sdiff[32] = { 0 };
// store for api stats
stratum_diff = sctx->job.diff;
if ( opt_showdiff && g_work->targetdiff != stratum_diff )
if ( !opt_quiet && opt_showdiff && g_work->targetdiff != stratum_diff )
{
snprintf( sdiff, 32, " (%.5f)", g_work->targetdiff );
applog( LOG_WARNING, "Stratum difficulty set to %g%s", stratum_diff,
sdiff );
// snprintf( sdiff, 32, " (%.5f)", g_work->targetdiff );
applog( LOG_BLUE, "Stratum difficulty set to %g", stratum_diff );
// sdiff );
}
}
}
@@ -2492,114 +2643,118 @@ void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
static void *stratum_thread(void *userdata )
{
struct thr_info *mythr = (struct thr_info *) userdata;
char *s;
struct thr_info *mythr = (struct thr_info *) userdata;
char *s;
stratum.url = (char*) tq_pop(mythr->q, NULL);
if (!stratum.url)
goto out;
applog(LOG_INFO, "Starting Stratum on %s", stratum.url);
stratum.url = (char*) tq_pop(mythr->q, NULL);
if (!stratum.url)
goto out;
applog(LOG_INFO, "Starting Stratum on %s", stratum.url);
while (1)
{
int failures = 0;
while (1)
{
int failures = 0;
if ( stratum_need_reset )
{
stratum_need_reset = false;
stratum_disconnect( &stratum );
if ( strcmp( stratum.url, rpc_url ) )
{
free( stratum.url );
stratum.url = strdup( rpc_url );
applog(LOG_BLUE, "Connection changed to %s", short_url);
}
else if ( !opt_quiet )
applog(LOG_DEBUG, "Stratum connection reset");
}
if ( stratum_need_reset )
{
stratum_need_reset = false;
stratum_disconnect( &stratum );
if ( strcmp( stratum.url, rpc_url ) )
{
free( stratum.url );
stratum.url = strdup( rpc_url );
applog(LOG_BLUE, "Connection changed to %s", short_url);
}
else if ( !opt_quiet )
applog(LOG_DEBUG, "Stratum connection reset");
}
while ( !stratum.curl )
{
pthread_mutex_lock( &g_work_lock );
g_work_time = 0;
pthread_mutex_unlock( &g_work_lock );
restart_threads();
if ( !stratum_connect( &stratum, stratum.url )
|| !stratum_subscribe( &stratum )
|| !stratum_authorize( &stratum, rpc_user, rpc_pass ) )
{
stratum_disconnect( &stratum );
if (opt_retries >= 0 && ++failures > opt_retries)
{
applog(LOG_ERR, "...terminating workio thread");
tq_push(thr_info[work_thr_id].q, NULL);
goto out;
}
if (!opt_benchmark)
applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
sleep(opt_fail_pause);
}
while ( !stratum.curl )
{
pthread_mutex_lock( &g_work_lock );
g_work_time = 0;
pthread_mutex_unlock( &g_work_lock );
restart_threads();
if ( !stratum_connect( &stratum, stratum.url )
|| !stratum_subscribe( &stratum )
|| !stratum_authorize( &stratum, rpc_user, rpc_pass ) )
{
stratum_disconnect( &stratum );
if (opt_retries >= 0 && ++failures > opt_retries)
{
applog(LOG_ERR, "...terminating workio thread");
tq_push(thr_info[work_thr_id].q, NULL);
goto out;
}
if (!opt_benchmark)
applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
sleep(opt_fail_pause);
}
if (jsonrpc_2)
{
work_free(&g_work);
work_copy(&g_work, &stratum.work);
}
}
if (jsonrpc_2)
{
work_free(&g_work);
work_copy(&g_work, &stratum.work);
}
}
if ( stratum.job.job_id &&
( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) )
{
pthread_mutex_lock(&g_work_lock);
algo_gate.stratum_gen_work( &stratum, &g_work );
time(&g_work_time);
pthread_mutex_unlock(&g_work_lock);
// restart_threads();
if ( stratum.job.job_id
&& ( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) )
{
pthread_mutex_lock(&g_work_lock);
algo_gate.stratum_gen_work( &stratum, &g_work );
time(&g_work_time);
pthread_mutex_unlock(&g_work_lock);
restart_threads();
if (stratum.job.clean || jsonrpc_2)
{
static uint32_t last_bloc_height;
if ( last_bloc_height != stratum.bloc_height )
{
last_bloc_height = stratum.bloc_height;
if ( !opt_quiet )
{
if (net_diff > 0.)
applog(LOG_BLUE, "%s block %d, network diff %.3f",
algo_names[opt_algo], stratum.bloc_height, net_diff);
else
applog(LOG_BLUE, "%s %s block %d", short_url,
algo_names[opt_algo], stratum.bloc_height);
}
}
restart_threads();
}
else if (opt_debug && !opt_quiet)
{
applog(LOG_BLUE, "%s asks job %d for block %d", short_url,
strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
}
} // stratum.job.job_id
if ( stratum.job.clean || jsonrpc_2 )
{
static uint32_t last_bloc_height;
if ( last_bloc_height != stratum.bloc_height )
{
last_bloc_height = stratum.bloc_height;
if ( !opt_quiet )
{
if ( net_diff > 0. )
applog( LOG_BLUE,
"%s block %d, job %s, network diff %.4f",
algo_names[opt_algo], stratum.bloc_height,
g_work.job_id, net_diff);
else
applog( LOG_BLUE, "%s %s block %d, job %s",
short_url, algo_names[opt_algo],
stratum.bloc_height, g_work.job_id );
}
}
else if ( !opt_quiet )
applog( LOG_BLUE,"New job %s.", g_work.job_id );
}
else if (opt_debug && !opt_quiet)
{
applog( LOG_BLUE, "%s asks job %d for block %d", short_url,
strtoul( stratum.job.job_id, NULL, 16 ), stratum.bloc_height );
}
} // stratum.job.job_id
if ( !stratum_socket_full( &stratum, opt_timeout ) )
{
applog(LOG_ERR, "Stratum connection timeout");
s = NULL;
}
else
s = stratum_recv_line(&stratum);
if ( !s )
{
stratum_disconnect(&stratum);
if ( !stratum_socket_full( &stratum, opt_timeout ) )
{
applog(LOG_ERR, "Stratum connection timeout");
s = NULL;
}
else
s = stratum_recv_line(&stratum);
if ( !s )
{
stratum_disconnect(&stratum);
// applog(LOG_WARNING, "Stratum connection interrupted");
continue;
}
if (!stratum_handle_method(&stratum, s))
continue;
}
if (!stratum_handle_method(&stratum, s))
stratum_handle_response(s);
free(s);
} // loop
free(s);
} // loop
out:
return NULL;
return NULL;
}
void show_version_and_exit(void)
@@ -3402,23 +3557,23 @@ int main(int argc, char *argv[])
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
// Initialize stats times and counters
gettimeofday( &prev_submit_time, NULL );
memcpy( &submit_time, &prev_submit_time, sizeof submit_time );
memcpy( &five_min_start, &prev_submit_time, sizeof prev_submit_time );
memset( share_stats, 0, 2 * sizeof (struct share_stats_t) );
gettimeofday( &last_submit_time, NULL );
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
if ( !check_cpu_capability() ) exit(1);
pthread_mutex_init(&stats_lock, NULL);
pthread_mutex_init(&g_work_lock, NULL);
pthread_mutex_init(&rpc2_job_lock, NULL);
pthread_mutex_init(&rpc2_login_lock, NULL);
pthread_mutex_init(&stratum.sock_lock, NULL);
pthread_mutex_init(&stratum.work_lock, NULL);
pthread_mutex_init( &stats_lock, NULL );
pthread_mutex_init( &g_work_lock, NULL );
pthread_mutex_init( &rpc2_job_lock, NULL );
pthread_mutex_init( &rpc2_login_lock, NULL );
pthread_mutex_init( &stratum.sock_lock, NULL );
pthread_mutex_init( &stratum.work_lock, NULL );
flags = !opt_benchmark && strncmp(rpc_url, "https:", 6)
? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL)
flags = !opt_benchmark && strncmp( rpc_url, "https:", 6 )
? ( CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL )
: CURL_GLOBAL_ALL;
if (curl_global_init(flags))
if ( curl_global_init( flags ) )
{
applog(LOG_ERR, "CURL initialization failed");
return 1;

View File

@@ -349,7 +349,7 @@ void cpu_brand_string( char* s );
float cpu_temp( int core );
struct work {
uint32_t data[48];
uint32_t data[48] __attribute__ ((aligned (64)));
uint32_t target[8];
double targetdiff;
@@ -401,7 +401,7 @@ struct stratum_ctx {
unsigned char *xnonce1;
size_t xnonce2_size;
struct stratum_job job;
struct work work;
struct work work __attribute__ ((aligned (64)));
pthread_mutex_t work_lock;
int bloc_height;

View File

@@ -174,30 +174,41 @@
#if defined(__MMX__)
// 64 bit vectors
#include "simd-utils/simd-mmx.h"
#include "simd-utils/simd-64.h"
#include "simd-utils/intrlv-mmx.h"
#if defined(__SSE2__)
// 128 bit vectors
#include "simd-utils/simd-sse2.h"
#include "simd-utils/simd-128.h"
#include "simd-utils/intrlv-sse2.h"
#if defined(__AVX__)
// 256 bit vector basics
#include "simd-utils/simd-256.h"
#include "simd-utils/intrlv-avx.h"
#if defined(__AVX2__)
// 256 bit vectors
#include "simd-utils/simd-avx2.h"
// 256 bit everything else
//#include "simd-utils/simd-avx2.h"
#include "simd-utils/intrlv-avx2.h"
// Skylake-X has all these
#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// 512 bit vectors
#include "simd-utils/simd-avx512.h"
#include "simd-utils/simd-512.h"
#include "simd-utils/intrlv-avx512.h"
#endif // MMX
#endif // SSE2
#endif // AVX
#endif // AVX2
#endif // AVX512
// Picks implementation based on available CPU features.
#include "simd-utils/intrlv-selector.h"
#endif // SIMD_UTILS_H__

911
simd-utils/intrlv-avx.h Normal file
View File

@@ -0,0 +1,911 @@
#if !defined(INTRLV_AVX_H__)
#define INTRLV_AVX_H__ 1
// philosophical discussion
//
// transitions:
//
// int32 <-> int64
// uint64_t = (uint64_t)int32_lo | ( (uint64_t)int32_hi << 32 )
// Efficient transition and post processing, 32 bit granularity is lost.
//
// int32 <-> m64
// More complex, 32 bit granularity maintained, limited number of mmx regs.
// int32 <-> int64 <-> m64 might be more efficient.
//
// int32 <-> m128
// Expensive, current implementation.
//
// int32 <-> m256
// Very expensive multi stage, current implementation.
//
// int64/m64 <-> m128
// Efficient, agnostic to native element size. Common.
//
// m128 <-> m256
// Expensive for a single instruction, unavoidable. Common.
//
// Multi stage options
//
// int32 <-> int64 -> m128
// More efficient than insert32, granularity maintained. Common.
//
// int64 <-> m128 -> m256
// Unavoidable, reasonably efficient. Common
//
// int32 <-> int64 -> m128 -> m256
// Seems inevitable, most efficient despite number of stages. Common.
//
// Implementation plan.
//
// 1. Complete m128 <-> m256
// 2. Implement int64 <-> m128
// 3. Combine int64 <-> m128 <-> m256
// 4. Implement int32 <-> int64 <-> m128
// 5. Combine int32 <-> int64 <-> m128 <-> m256
//
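// A small illustrative sketch (hypothetical helpers, not part of this header)
// of the int32 <-> int64 transition described above: two 32 bit lanes travel
// in one 64 bit word, which is cheap to move but gives up 32 bit granularity.

#include <stdint.h>

static inline uint64_t pack_2x32( uint32_t lo, uint32_t hi )
{
   return (uint64_t)lo | ( (uint64_t)hi << 32 );
}
static inline uint32_t unpack_lo32( uint64_t w ) { return (uint32_t) w; }
static inline uint32_t unpack_hi32( uint64_t w ) { return (uint32_t)( w >> 32 ); }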
#if defined(__AVX__)
// Convenient short cuts for local use only
// Extract 64 bits from the low 128 bits of 256 bit vector.
#define extr64_cast128_256( a, n ) \
_mm_extract_epi64( _mm256_castsi256_si128( a ), n )
// Extract 32 bits from the low 128 bits of 256 bit vector.
#define extr32_cast128_256( a, n ) \
_mm_extract_epi32( _mm256_castsi256_si128( a ), n )
///////////////////////////////////////////////////////////
//
// AVX 256 Bit Vectors
//
// 256 bit interleaving can be done with AVX.
#define mm256_put_64( s0, s1, s2, s3) \
_mm256_set_epi64x( *((const uint64_t*)(s3)), *((const uint64_t*)(s2)), \
*((const uint64_t*)(s1)), *((const uint64_t*)(s0)) )
#define mm256_put_32( s00, s01, s02, s03, s04, s05, s06, s07 ) \
_mm256_set_epi32( *((const uint32_t*)(s07)), *((const uint32_t*)(s06)), \
*((const uint32_t*)(s05)), *((const uint32_t*)(s04)), \
*((const uint32_t*)(s03)), *((const uint32_t*)(s02)), \
*((const uint32_t*)(s01)), *((const uint32_t*)(s00)) )
#define mm256_get_64( s, i0, i1, i2, i3 ) \
_mm256_set_epi64x( ((const uint64_t*)(s))[i3], ((const uint64_t*)(s))[i2], \
((const uint64_t*)(s))[i1], ((const uint64_t*)(s))[i0] )
#define mm256_get_32( s, i0, i1, i2, i3, i4, i5, i6, i7 ) \
_mm256_set_epi32( ((const uint32_t*)(s))[i7], ((const uint32_t*)(s))[i6], \
((const uint32_t*)(s))[i5], ((const uint32_t*)(s))[i4], \
((const uint32_t*)(s))[i3], ((const uint32_t*)(s))[i2], \
((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
/*
// Blend 2 vectors alternating hi & lo: { hi[n], lo[n-1], ... hi[1], lo[0] }
#define mm256_intrlv_blend_128( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x0f )
#define mm256_intrlv_blend_64( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x33 )
#define mm256_intrlv_blend_32( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x55 )
*/
// Interleave 8x32_256
#define mm256_intrlv_8x32_256( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
do { \
__m128i s0hi = mm128_extr_hi128_256( s0 ); \
__m128i s1hi = mm128_extr_hi128_256( s1 ); \
__m128i s2hi = mm128_extr_hi128_256( s2 ); \
__m128i s3hi = mm128_extr_hi128_256( s3 ); \
__m128i s4hi = mm128_extr_hi128_256( s4 ); \
__m128i s5hi = mm128_extr_hi128_256( s5 ); \
__m128i s6hi = mm128_extr_hi128_256( s6 ); \
__m128i s7hi = mm128_extr_hi128_256( s7 ); \
casti_m256i( d,0 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,0), extr32_cast128_256(s6,0), \
extr32_cast128_256(s5,0), extr32_cast128_256(s4,0), \
extr32_cast128_256(s3,0), extr32_cast128_256(s2,0), \
extr32_cast128_256(s1,0), extr32_cast128_256(s0,0) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,1), extr32_cast128_256(s6,1), \
extr32_cast128_256(s5,1), extr32_cast128_256(s4,1), \
extr32_cast128_256(s3,1), extr32_cast128_256(s2,1), \
extr32_cast128_256(s1,1), extr32_cast128_256(s0,1) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,2), extr32_cast128_256(s6,2), \
extr32_cast128_256(s5,2), extr32_cast128_256(s4,2), \
extr32_cast128_256(s3,2), extr32_cast128_256(s2,2), \
extr32_cast128_256(s1,2), extr32_cast128_256(s0,2) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,3), extr32_cast128_256(s6,3), \
extr32_cast128_256(s5,3), extr32_cast128_256(s4,3), \
extr32_cast128_256(s3,3), extr32_cast128_256(s2,3), \
extr32_cast128_256(s1,3), extr32_cast128_256(s0,3) ); \
casti_m256i( d,4 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,0), mm128_extr_32(s6hi,0), \
mm128_extr_32(s5hi,0), mm128_extr_32(s4hi,0), \
mm128_extr_32(s3hi,0), mm128_extr_32(s2hi,0), \
mm128_extr_32(s1hi,0), mm128_extr_32(s0hi,0) ); \
casti_m256i( d,5 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,1), mm128_extr_32(s6hi,1), \
mm128_extr_32(s5hi,1), mm128_extr_32(s4hi,1), \
mm128_extr_32(s3hi,1), mm128_extr_32(s2hi,1), \
mm128_extr_32(s1hi,1), mm128_extr_32(s0hi,1) ); \
casti_m256i( d,6 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,2), mm128_extr_32(s6hi,2), \
mm128_extr_32(s5hi,2), mm128_extr_32(s4hi,2), \
mm128_extr_32(s3hi,2), mm128_extr_32(s2hi,2), \
mm128_extr_32(s1hi,2), mm128_extr_32(s0hi,2) ); \
casti_m256i( d,7 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,3), mm128_extr_32(s6hi,3), \
mm128_extr_32(s5hi,3), mm128_extr_32(s4hi,3), \
mm128_extr_32(s3hi,3), mm128_extr_32(s2hi,3), \
mm128_extr_32(s1hi,3), mm128_extr_32(s0hi,3) ); \
} while(0)
#define mm256_intrlv_8x32_128( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
do { \
casti_m256i( d,0 ) = _mm256_set_epi32( \
mm128_extr_32(s7,0), mm128_extr_32(s6,0), \
mm128_extr_32(s5,0), mm128_extr_32(s4,0), \
mm128_extr_32(s3,0), mm128_extr_32(s2,0), \
mm128_extr_32(s1,0), mm128_extr_32(s0,0) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
mm128_extr_32(s7,1), mm128_extr_32(s6,1), \
mm128_extr_32(s5,1), mm128_extr_32(s4,1), \
mm128_extr_32(s3,1), mm128_extr_32(s2,1), \
mm128_extr_32(s1,1), mm128_extr_32(s0,1) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
mm128_extr_32(s7,2), mm128_extr_32(s6,2), \
mm128_extr_32(s5,2), mm128_extr_32(s4,2), \
mm128_extr_32(s3,2), mm128_extr_32(s2,2), \
mm128_extr_32(s1,2), mm128_extr_32(s0,2) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
mm128_extr_32(s7,3), mm128_extr_32(s6,3), \
mm128_extr_32(s5,3), mm128_extr_32(s4,3), \
mm128_extr_32(s3,3), mm128_extr_32(s2,3), \
mm128_extr_32(s1,3), mm128_extr_32(s0,3) ); \
} while(0)
/*
#define mm256_bswap_intrlv_8x32_256( d, src ) \
do { \
__m256i s0 = mm256_bswap_32( src ); \
__m128i s1 = _mm256_extracti128_si256( s0, 1 ); \
casti_m256i( d, 0 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 0 ) ); \
casti_m256i( d, 1 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 1 ) ); \
casti_m256i( d, 2 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 2 ) ); \
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 3 ) ); \
casti_m256i( d, 4 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 0 ) ); \
casti_m256i( d, 5 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 1 ) ); \
casti_m256i( d, 6 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 2 ) ); \
casti_m256i( d, 7 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 3 ) ); \
} while(0)
#define mm256_bswap_intrlv_8x32_128( d, src ) \
do { \
__m128i ss = mm128_bswap_32( src ); \
casti_m256i( d, 0 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 0 ) ); \
casti_m256i( d, 1 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 1 ) ); \
casti_m256i( d, 2 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 2 ) ); \
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 3 ) ); \
} while(0)
*/
#define mm256_dintrlv_8x32_256( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
do { \
__m256i s0 = casti_m256i(s,0); \
__m256i s1 = casti_m256i(s,1); \
__m256i s2 = casti_m256i(s,2); \
__m256i s3 = casti_m256i(s,3); \
__m256i s4 = casti_m256i(s,4); \
__m256i s5 = casti_m256i(s,5); \
__m256i s6 = casti_m256i(s,6); \
__m256i s7 = casti_m256i(s,7); \
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
__m128i s4hi = _mm256_extracti128_si256( s4, 1 ); \
__m128i s5hi = _mm256_extracti128_si256( s5, 1 ); \
__m128i s6hi = _mm256_extracti128_si256( s6, 1 ); \
__m128i s7hi = _mm256_extracti128_si256( s7, 1 ); \
d0 = _mm256_set_epi32( \
extr32_cast128_256( s7, 0 ), extr32_cast128_256( s6, 0 ), \
extr32_cast128_256( s5, 0 ), extr32_cast128_256( s4, 0 ), \
extr32_cast128_256( s3, 0 ), extr32_cast128_256( s2, 0 ), \
extr32_cast128_256( s1, 0 ), extr32_cast128_256( s0, 0 ) );\
d1 = _mm256_set_epi32( \
extr32_cast128_256( s7, 1 ), extr32_cast128_256( s6, 1 ), \
extr32_cast128_256( s5, 1 ), extr32_cast128_256( s4, 1 ), \
extr32_cast128_256( s3, 1 ), extr32_cast128_256( s2, 1 ), \
extr32_cast128_256( s1, 1 ), extr32_cast128_256( s0, 1 ) );\
d2 = _mm256_set_epi32( \
extr32_cast128_256( s7, 2 ), extr32_cast128_256( s6, 2 ), \
extr32_cast128_256( s5, 2 ), extr32_cast128_256( s4, 2 ), \
extr32_cast128_256( s3, 2 ), extr32_cast128_256( s2, 2 ), \
extr32_cast128_256( s1, 2 ), extr32_cast128_256( s0, 2 ) );\
d3 = _mm256_set_epi32( \
extr32_cast128_256( s7, 3 ), extr32_cast128_256( s6, 3 ), \
extr32_cast128_256( s5, 3 ), extr32_cast128_256( s4, 3 ), \
extr32_cast128_256( s3, 3 ), extr32_cast128_256( s2, 3 ), \
extr32_cast128_256( s1, 3 ), extr32_cast128_256( s0, 3 ) );\
d4 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 0 ), _mm_extract_epi32( s6hi, 0 ), \
_mm_extract_epi32( s5hi, 0 ), _mm_extract_epi32( s4hi, 0 ), \
_mm_extract_epi32( s3hi, 0 ), _mm_extract_epi32( s2hi, 0 ), \
_mm_extract_epi32( s1hi, 0 ), _mm_extract_epi32( s0hi, 0 ) ); \
d5 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 1 ), _mm_extract_epi32( s6hi, 1 ), \
_mm_extract_epi32( s5hi, 1 ), _mm_extract_epi32( s4hi, 1 ), \
_mm_extract_epi32( s3hi, 1 ), _mm_extract_epi32( s2hi, 1 ), \
_mm_extract_epi32( s1hi, 1 ), _mm_extract_epi32( s0hi, 1 ) ); \
d6 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 2 ), _mm_extract_epi32( s6hi, 2 ), \
_mm_extract_epi32( s5hi, 2 ), _mm_extract_epi32( s4hi, 2 ), \
_mm_extract_epi32( s3hi, 2 ), _mm_extract_epi32( s2hi, 2 ), \
_mm_extract_epi32( s1hi, 2 ), _mm_extract_epi32( s0hi, 2 ) ); \
d7 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 3 ), _mm_extract_epi32( s6hi, 3 ), \
_mm_extract_epi32( s5hi, 3 ), _mm_extract_epi32( s4hi, 3 ), \
_mm_extract_epi32( s3hi, 3 ), _mm_extract_epi32( s2hi, 3 ), \
_mm_extract_epi32( s1hi, 3 ), _mm_extract_epi32( s0hi, 3 ) ); \
} while(0)
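// Deinterleave the final 128 bits of 8 lanes: 32 bit word w of lane l sits
// at 32 bit index 8*w + l of the interleaved source.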
#define mm128_dintrlv_8x32_128( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
do { \
   __m128i s0 = casti_m128i(s,0); \
   __m128i s1 = casti_m128i(s,1); \
   __m128i s2 = casti_m128i(s,2); \
   __m128i s3 = casti_m128i(s,3); \
   __m128i s4 = casti_m128i(s,4); \
   __m128i s5 = casti_m128i(s,5); \
   __m128i s6 = casti_m128i(s,6); \
   __m128i s7 = casti_m128i(s,7); \
   d0 = _mm_set_epi32( \
         _mm_extract_epi32( s6, 0 ), _mm_extract_epi32( s4, 0 ), \
         _mm_extract_epi32( s2, 0 ), _mm_extract_epi32( s0, 0 ) ); \
   d1 = _mm_set_epi32( \
         _mm_extract_epi32( s6, 1 ), _mm_extract_epi32( s4, 1 ), \
         _mm_extract_epi32( s2, 1 ), _mm_extract_epi32( s0, 1 ) ); \
   d2 = _mm_set_epi32( \
         _mm_extract_epi32( s6, 2 ), _mm_extract_epi32( s4, 2 ), \
         _mm_extract_epi32( s2, 2 ), _mm_extract_epi32( s0, 2 ) ); \
   d3 = _mm_set_epi32( \
         _mm_extract_epi32( s6, 3 ), _mm_extract_epi32( s4, 3 ), \
         _mm_extract_epi32( s2, 3 ), _mm_extract_epi32( s0, 3 ) ); \
   d4 = _mm_set_epi32( \
         _mm_extract_epi32( s7, 0 ), _mm_extract_epi32( s5, 0 ), \
         _mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s1, 0 ) ); \
   d5 = _mm_set_epi32( \
         _mm_extract_epi32( s7, 1 ), _mm_extract_epi32( s5, 1 ), \
         _mm_extract_epi32( s3, 1 ), _mm_extract_epi32( s1, 1 ) ); \
   d6 = _mm_set_epi32( \
         _mm_extract_epi32( s7, 2 ), _mm_extract_epi32( s5, 2 ), \
         _mm_extract_epi32( s3, 2 ), _mm_extract_epi32( s1, 2 ) ); \
   d7 = _mm_set_epi32( \
         _mm_extract_epi32( s7, 3 ), _mm_extract_epi32( s5, 3 ), \
         _mm_extract_epi32( s3, 3 ), _mm_extract_epi32( s1, 3 ) ); \
} while(0)
#define mm256_intrlv_4x64_256( d, s0, s1, s2, s3 ) \
do { \
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
casti_m256i( d,0 ) = _mm256_set_epi64x( \
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ), \
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi64x( \
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ), \
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) ); \
casti_m256i( d,2 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3hi,0 ), _mm_extract_epi64( s2hi,0 ), \
_mm_extract_epi64( s1hi,0 ), _mm_extract_epi64( s0hi,0 ) ); \
casti_m256i( d,3 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3hi,1 ), _mm_extract_epi64( s2hi,1 ), \
_mm_extract_epi64( s1hi,1 ), _mm_extract_epi64( s0hi,1 ) ); \
} while(0)
#define mm256_intrlv_4x64_128( d, s0, s1, s2, s3 ) \
do { \
casti_m256i( d,0 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3, 0 ), _mm_extract_epi64( s2, 0 ), \
_mm_extract_epi64( s1, 0 ), _mm_extract_epi64( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3, 1 ), _mm_extract_epi64( s2, 1 ), \
_mm_extract_epi64( s1, 1 ), _mm_extract_epi64( s0, 1 ) ); \
} while(0)
/*
#define mm256_bswap_intrlv_4x64_256( d, src ) \
do { \
__m256i s0 = mm256_bswap_32( src ); \
__m128i s1 = _mm256_extracti128_si256( s0, 1 ); \
casti_m256i( d,0 ) = _mm256_set1_epi64x( _mm_extract_epi64( \
_mm256_castsi256_si128( s0 ), 0 ) ); \
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( \
_mm256_castsi256_si128( s0 ), 1 ) ); \
casti_m256i( d,2 ) = _mm256_set1_epi64x( _mm_extract_epi64( s1, 0 ) ); \
casti_m256i( d,3 ) = _mm256_set1_epi64x( _mm_extract_epi64( s1, 1 ) ); \
} while(0)
#define mm256_bswap_intrlv_4x64_128( d, src ) \
do { \
__m128i ss = mm128_bswap_32( src ); \
casti_m256i( d,0 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 1 ) ); \
} while(0)
*/
// 4 lanes of 256 bits using 64 bit interleaving (standard final hash size)
static inline void mm256_dintrlv_4x64_256( void *d0, void *d1, void *d2,
void *d3, const int n, const void *src )
{
__m256i s0 = *( (__m256i*) src ); // s[0][1:0]
__m256i s1 = *( (__m256i*)(src+32) ); // s[1][1:0]
__m256i s2 = *( (__m256i*)(src+64) ); // s[2][1:0]
__m256i s3 = *( (__m256i*)(src+96) ); // s[3][1:0]
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); // s[0][3:2]
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); // s[1][3:2]
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); // s[2][3:2]
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); // s[3][3:2]
casti_m256i( d0,n ) = _mm256_set_epi64x(
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ),
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) );
casti_m256i( d1,n ) = _mm256_set_epi64x(
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ),
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) );
casti_m256i( d2,n ) = _mm256_set_epi64x(
_mm_extract_epi64( s3hi, 0 ), _mm_extract_epi64( s2hi, 0 ),
_mm_extract_epi64( s1hi, 0 ), _mm_extract_epi64( s0hi, 0 ) );
casti_m256i( d3,n ) = _mm256_set_epi64x(
_mm_extract_epi64( s3hi, 1 ), _mm_extract_epi64( s2hi, 1 ),
_mm_extract_epi64( s1hi, 1 ), _mm_extract_epi64( s0hi, 1 ) );
}
// quarter avx2 block, 16 bytes * 4 lanes
// 4 lanes of 128 bits using 64 bit interleaving
// Used for last 16 bytes of 80 byte input, only used for testing.
static inline void mm128_dintrlv_4x64_128( void *d0, void *d1, void *d2,
void *d3, const int n, const void *src )
{
__m256i s0 = *( (__m256i*) src );
__m256i s1 = *( (__m256i*)(src+32) );
__m128i s0hi = _mm256_extracti128_si256( s0, 1 );
__m128i s1hi = _mm256_extracti128_si256( s1, 1 );
casti_m128i( d0,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 0 ),
extr64_cast128_256( s0 , 0 ) );
casti_m128i( d1,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 1 ),
extr64_cast128_256( s0 , 1 ) );
casti_m128i( d2,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 0 ),
_mm_extract_epi64( s0hi, 0 ) );
casti_m128i( d3,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 1 ),
_mm_extract_epi64( s0hi, 1 ) );
}
/*
static inline void mm256_dintrlv_2x128x256( void *d0, void *d1,
const int n, const void *s )
{
casti_m256i( d0,n ) = mm256_get_64( s, 0, 1, 4, 5 );
casti_m256i( d1,n ) = mm256_get_64( s, 2, 3, 6, 7 );
}
*/
//
#define mm256_intrlv_4x32_256( d, s0, s1, s2, s3 ) \
do { \
casti_m256i( d,0 ) = _mm256_set_epi32( \
mm128_extr_32( s3, 1 ), mm128_extr_32( s2, 1 ), \
mm128_extr_32( s1, 1 ), mm128_extr_32( s0, 1 ), \
mm128_extr_32( s3, 0 ), mm128_extr_32( s2, 0 ), \
mm128_extr_32( s1, 0 ), mm128_extr_32( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
mm128_extr_32( s3, 3 ), mm128_extr_32( s2, 3 ), \
mm128_extr_32( s1, 3 ), mm128_extr_32( s0, 3 ), \
mm128_extr_32( s3, 2 ), mm128_extr_32( s2, 2 ), \
mm128_extr_32( s1, 2 ), mm128_extr_32( s0, 2 ) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
mm128_extr_32( s3, 5 ), mm128_extr_32( s2, 5 ), \
mm128_extr_32( s1, 5 ), mm128_extr_32( s0, 5 ), \
mm128_extr_32( s3, 4 ), mm128_extr_32( s2, 4 ), \
mm128_extr_32( s1, 4 ), mm128_extr_32( s0, 4 ) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
mm128_extr_32( s3, 7 ), mm128_extr_32( s2, 7 ), \
mm128_extr_32( s1, 7 ), mm128_extr_32( s0, 7 ), \
mm128_extr_32( s3, 6 ), mm128_extr_32( s2, 6 ), \
mm128_extr_32( s1, 6 ), mm128_extr_32( s0, 6 ) ); \
} while(0)
// 256 bit versions of common 128 bit functions.
static inline void mm256_intrlv_4x32( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, int bit_len )
{
mm256_intrlv_4x32_256( d ,casti_m256i(s0,0), casti_m256i(s1,0),
casti_m256i(s2,0), casti_m256i(s3,0) );
if ( bit_len <= 256 ) return;
mm256_intrlv_4x32_256( d+128 ,casti_m256i(s0,1), casti_m256i(s1,1),
casti_m256i(s2,1), casti_m256i(s3,1) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm128_intrlv_4x32_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
casti_m128i(s2,4), casti_m128i(s3,4) );
return;
}
mm256_intrlv_4x32_256( d+256 ,casti_m256i(s0,2), casti_m256i(s1,2),
casti_m256i(s2,2), casti_m256i(s3,2) );
mm256_intrlv_4x32_256( d+384 ,casti_m256i(s0,3), casti_m256i(s1,3),
casti_m256i(s2,3), casti_m256i(s3,3) );
}
static inline void mm256_dintrlv_4x32_256( void *d0, void *d1, void *d2,
void *d3, const void *src )
{
__m256i s0 = *(__m256i*) src;
__m256i s1 = *(__m256i*)(src+32);
__m256i s2 = *(__m256i*)(src+64);
__m256i s3 = *(__m256i*)(src+96);
*(__m256i*)d0 = _mm256_set_epi32(
_mm256_extract_epi32( s3,4 ), _mm256_extract_epi32( s3,0 ),
_mm256_extract_epi32( s2,4 ), _mm256_extract_epi32( s2,0 ),
_mm256_extract_epi32( s1,4 ), _mm256_extract_epi32( s1,0 ),
_mm256_extract_epi32( s0,4 ), _mm256_extract_epi32( s0,0 ) );
*(__m256i*)d1 = _mm256_set_epi32(
_mm256_extract_epi32( s3,5 ), _mm256_extract_epi32( s3,1 ),
_mm256_extract_epi32( s2,5 ), _mm256_extract_epi32( s2,1 ),
_mm256_extract_epi32( s1,5 ), _mm256_extract_epi32( s1,1 ),
_mm256_extract_epi32( s0,5 ), _mm256_extract_epi32( s0,1 ) );
*(__m256i*)d2 = _mm256_set_epi32(
_mm256_extract_epi32( s3,6 ), _mm256_extract_epi32( s3,2 ),
_mm256_extract_epi32( s2,6 ), _mm256_extract_epi32( s2,2 ),
_mm256_extract_epi32( s1,6 ), _mm256_extract_epi32( s1,2 ),
_mm256_extract_epi32( s0,6 ), _mm256_extract_epi32( s0,2 ) );
*(__m256i*)d3 = _mm256_set_epi32(
_mm256_extract_epi32( s3,7 ), _mm256_extract_epi32( s3,3 ),
_mm256_extract_epi32( s2,7 ), _mm256_extract_epi32( s2,3 ),
_mm256_extract_epi32( s1,7 ), _mm256_extract_epi32( s1,3 ),
_mm256_extract_epi32( s0,7 ), _mm256_extract_epi32( s0,3 ) );
}
static inline void mm256_dintrlv_4x32( void *d0, void *d1, void *d2,
void *d3, const void *s, int bit_len )
{
mm256_dintrlv_4x32_256( d0 , d1 , d2 , d3 , s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_4x32_256( d0+ 32, d1+ 32, d2+ 32, d3+ 32, s+128 );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm128_dintrlv_4x32_128( d0+ 64, d1+ 64, d2+ 64, d3+ 64, s+256 );
return;
}
mm256_dintrlv_4x32_256( d0+ 64, d1+ 64, d2+ 64, d3+ 64, s+256 );
mm256_dintrlv_4x32_256( d0+ 96, d1+ 96, d2+ 96, d3+ 96, s+384 );
}
static inline void mm256_extr_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d, 0 ) = mm256_get_32( s, lane , lane+ 4, lane+ 8, lane+12,
lane+16, lane+20, lane+24, lane+28 );
if ( bit_len <= 256 ) return;
casti_m256i( d, 1 ) = mm256_get_32( s, lane+32, lane+36, lane+40, lane+44,
lane+48, lane+52, lane+56, lane+60 );
}
// Interleave 8 source buffers containing 32 bit data into the destination
// vector
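// The destination must hold 8 * bit_len / 8 bytes, e.g. 8 * 80 = 640 bytes
// when interleaving 80 byte (640 bit) block headers.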
static inline void mm256_intrlv_8x32( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, const void *s4,
const void *s5, const void *s6, const void *s7, int bit_len )
{
mm256_intrlv_8x32_256( d , casti_m256i( s0,0 ), casti_m256i( s1,0 ),
casti_m256i( s2,0 ), casti_m256i( s3,0 ), casti_m256i( s4,0 ),
casti_m256i( s5,0 ), casti_m256i( s6,0 ), casti_m256i( s7,0 ) );
if ( bit_len <= 256 ) return;
mm256_intrlv_8x32_256( d+256, casti_m256i( s0,1 ), casti_m256i( s1,1 ),
casti_m256i( s2,1 ), casti_m256i( s3,1 ), casti_m256i( s4,1 ),
casti_m256i( s5,1 ), casti_m256i( s6,1 ), casti_m256i( s7,1 ) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm256_intrlv_8x32_128( d+512, casti_m128i( s0,4 ), casti_m128i( s1,4 ),
casti_m128i( s2,4 ), casti_m128i( s3,4 ), casti_m128i( s4,4 ),
casti_m128i( s5,4 ), casti_m128i( s6,4 ), casti_m128i( s7,4 ) );
return;
}
mm256_intrlv_8x32_256( d+512, casti_m256i( s0,2 ), casti_m256i( s1,2 ),
casti_m256i( s2,2 ), casti_m256i( s3,2 ), casti_m256i( s4,2 ),
casti_m256i( s5,2 ), casti_m256i( s6,2 ), casti_m256i( s7,2 ) );
mm256_intrlv_8x32_256( d+768, casti_m256i( s0,3 ), casti_m256i( s1,3 ),
casti_m256i( s2,3 ), casti_m256i( s3,3 ), casti_m256i( s4,3 ),
casti_m256i( s5,3 ), casti_m256i( s6,3 ), casti_m256i( s7,3 ) );
// bit_len == 1024
}
// A couple of mining-specific functions.
/*
// Interleave 80 bytes of 32 bit data for 8 lanes.
static inline void mm256_bswap_intrlv80_8x32( void *d, const void *s )
{
mm256_bswap_intrlv_8x32_256( d , casti_m256i( s, 0 ) );
mm256_bswap_intrlv_8x32_256( d+256, casti_m256i( s, 1 ) );
mm256_bswap_intrlv_8x32_128( d+512, casti_m128i( s, 4 ) );
}
*/
// Deinterleave 8 buffers of 32 bit data from the source buffer.
// The sub-function can be called directly for a 32 byte final hash.
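// For example, a 32 byte hash per lane can be deinterleaved with a single
// call to the sub-macro (illustrative; h0..h7 and vhash are caller buffers):
//
//   mm256_dintrlv_8x32_256( casti_m256i(h0,0), casti_m256i(h1,0),
//         casti_m256i(h2,0), casti_m256i(h3,0), casti_m256i(h4,0),
//         casti_m256i(h5,0), casti_m256i(h6,0), casti_m256i(h7,0), vhash );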
static inline void mm256_dintrlv_8x32( void *d0, void *d1, void *d2,
void *d3, void *d4, void *d5, void *d6, void *d7,
const void *s, int bit_len )
{
mm256_dintrlv_8x32_256( casti_m256i(d0,0), casti_m256i(d1,0),
casti_m256i(d2,0), casti_m256i(d3,0), casti_m256i(d4,0),
casti_m256i(d5,0), casti_m256i(d6,0), casti_m256i(d7,0), s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_8x32_256( casti_m256i(d0,1), casti_m256i(d1,1),
casti_m256i(d2,1), casti_m256i(d3,1), casti_m256i(d4,1),
casti_m256i(d5,1), casti_m256i(d6,1), casti_m256i(d7,1), s+256 );
if ( bit_len <= 512 ) return;
// short block, final 16 bytes of input data
if ( bit_len <= 640 )
{
      mm128_dintrlv_8x32_128( casti_m128i(d0,4), casti_m128i(d1,4),
            casti_m128i(d2,4), casti_m128i(d3,4), casti_m128i(d4,4),
            casti_m128i(d5,4), casti_m128i(d6,4), casti_m128i(d7,4), s+512 );
return;
}
// bit_len == 1024
mm256_dintrlv_8x32_256( casti_m256i(d0,2), casti_m256i(d1,2),
casti_m256i(d2,2), casti_m256i(d3,2), casti_m256i(d4,2),
casti_m256i(d5,2), casti_m256i(d6,2), casti_m256i(d7,2), s+512 );
mm256_dintrlv_8x32_256( casti_m256i(d0,3), casti_m256i(d1,3),
casti_m256i(d2,3), casti_m256i(d3,3), casti_m256i(d4,3),
casti_m256i(d5,3), casti_m256i(d6,3), casti_m256i(d7,3), s+768 );
}
static inline void mm256_extr_lane_8x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d,0 ) = mm256_get_32(s, lane , lane+ 8, lane+ 16, lane+ 24,
lane+32, lane+ 40, lane+ 48, lane+ 56 );
if ( bit_len <= 256 ) return;
casti_m256i( d,1 ) = mm256_get_32(s, lane+64, lane+ 72, lane+ 80, lane+ 88,
lane+96, lane+104, lane+112, lane+120 );
// bit_len == 512
}
// Interleave 4 source buffers containing 64 bit data into the destination
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
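// The destination must hold 4 * bit_len / 8 bytes, e.g. 4 * 80 = 320 bytes
// when interleaving 80 byte (640 bit) block headers.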
static inline void mm256_intrlv_4x64( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, int bit_len )
{
mm256_intrlv_4x64_256( d , casti_m256i(s0,0), casti_m256i(s1,0),
casti_m256i(s2,0), casti_m256i(s3,0) );
if ( bit_len <= 256 ) return;
mm256_intrlv_4x64_256( d+128, casti_m256i(s0,1), casti_m256i(s1,1),
casti_m256i(s2,1), casti_m256i(s3,1) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm256_intrlv_4x64_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
casti_m128i(s2,4), casti_m128i(s3,4) );
return;
}
// bit_len == 1024
mm256_intrlv_4x64_256( d+256, casti_m256i(s0,2), casti_m256i(s1,2),
casti_m256i(s2,2), casti_m256i(s3,2) );
mm256_intrlv_4x64_256( d+384, casti_m256i(s0,3), casti_m256i(s1,3),
casti_m256i(s2,3), casti_m256i(s3,3) );
}
/*
// Interleave 80 bytes of 32 bit data for 4 lanes using 64 bit interleaving.
static inline void mm256_bswap_intrlv80_4x64( void *d, const void *s )
{
mm256_bswap_intrlv_4x64_256( d , casti_m256i( s, 0 ) );
mm256_bswap_intrlv_4x64_256( d+128, casti_m256i( s, 1 ) );
mm256_bswap_intrlv_4x64_128( d+256, casti_m128i( s, 4 ) );
}
// Blend 32 byte lanes of hash from 2 sources according to control mask.
// macro due to 256 bit value arg.
#define mm256_blend_hash_4x64( dst, a, b, mask ) \
do { \
dst[0] = _mm256_blendv_epi8( a[0], b[0], mask ); \
dst[1] = _mm256_blendv_epi8( a[1], b[1], mask ); \
dst[2] = _mm256_blendv_epi8( a[2], b[2], mask ); \
dst[3] = _mm256_blendv_epi8( a[3], b[3], mask ); \
dst[4] = _mm256_blendv_epi8( a[4], b[4], mask ); \
dst[5] = _mm256_blendv_epi8( a[5], b[5], mask ); \
dst[6] = _mm256_blendv_epi8( a[6], b[6], mask ); \
dst[7] = _mm256_blendv_epi8( a[7], b[7], mask ); \
} while(0)
*/
// Deinterleave 4 buffers of 64 bit data from the source buffer.
// bit_len must be 256, 512, 640 or 1024 bits.
// Requires overrun padding for 640 bit len.
static inline void mm256_dintrlv_4x64( void *d0, void *d1, void *d2,
void *d3, const void *s, int bit_len )
{
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 0, s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 1, s+128 );
if ( bit_len <= 512 ) return;
// short block, final 16 bytes of input data
if ( bit_len <= 640 )
{
mm128_dintrlv_4x64_128( d0, d1, d2, d3, 4, s+256 );
return;
}
// bit_len == 1024
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 2, s+256 );
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 3, s+384 );
}
// extract and deinterleave specified lane.
#define mm256_extr_lane_4x64_256 \
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 )
static inline void mm256_extr_lane_4x64( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 );
if ( bit_len <= 256 ) return;
casti_m256i( d, 1 ) = mm256_get_64( s, lane+16, lane+20, lane+24, lane+28 );
return;
}
// Convert from 4x32 SSE2 interleaving to 4x64 AVX2 interleaving.
// This can't be done in place.
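// Index mapping, treating src as uint32_t: with 4x32 interleaving 32 bit
// word w of lane l is at s[ 4*w + l ]; with 4x64 interleaving 64 bit word
// W of lane l is the pair ( s[ 4*(2*W+1) + l ] : s[ 4*2*W + l ] ). For
// lane 1, word 0 that is ( s[5] : s[1] ), hence the s[5],s[1] pairing below.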
static inline void mm256_rintrlv_4x32_4x64( void *dst, void *src,
int bit_len )
{
__m256i* d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 3],s[ 6],s[ 2],s[ 5],s[ 1],s[ 4],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[ 9],s[12],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32(s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80]);
d[11] = _mm256_set_epi32(s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88]);
d[12] = _mm256_set_epi32(s[103],s[99],s[102],s[98],s[101],s[97],s[100],s[96]);
d[13] = _mm256_set_epi32(s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104]);
d[14] = _mm256_set_epi32(s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112]);
d[15] = _mm256_set_epi32(s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120]);
// bit_len == 1024
}
// Convert 4x64 (256 bit) interleaved vectors to 4x32 (128 bit) interleaving
// for AVX. bit_len must be a multiple of 64.
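// Reverse mapping: 64 bit word 0 of lane l occupies s[2*l] (lo half) and
// s[2*l+1] (hi half) in the 4x64 layout; the lo halves form one 4x32 row
// and the hi halves the next, which is why d[0] packs s[0],s[2],s[4],s[6]
// in its low 128 bits and s[1],s[3],s[5],s[7] in its high 128 bits.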
static inline void mm256_rintrlv_4x64_4x32( void *dst, void *src,
int bit_len )
{
__m256i *d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
d[12] = _mm256_set_epi32( s[103],s[101],s[99],s[97],s[102],s[100],s[98],s[96] );
d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
// bit_len == 1024
}
static inline void mm256_rintrlv_4x64_2x128( void *dst0, void *dst1,
const void *src, int bit_len )
{
__m256i* d0 = (__m256i*)dst0;
__m256i* d1 = (__m256i*)dst1;
uint64_t *s = (uint64_t*)src;
d0[0] = _mm256_set_epi64x( s[ 5], s[ 1], s[ 4], s[ 0] );
d1[0] = _mm256_set_epi64x( s[ 7], s[ 3], s[ 6], s[ 2] );
d0[1] = _mm256_set_epi64x( s[13], s[ 9], s[12], s[ 8] );
d1[1] = _mm256_set_epi64x( s[15], s[11], s[14], s[10] );
if ( bit_len <= 256 ) return;
d0[2] = _mm256_set_epi64x( s[21], s[17], s[20], s[16] );
d1[2] = _mm256_set_epi64x( s[23], s[19], s[22], s[18] );
d0[3] = _mm256_set_epi64x( s[29], s[25], s[28], s[24] );
d1[3] = _mm256_set_epi64x( s[31], s[27], s[30], s[26] );
if ( bit_len <= 512 ) return;
d0[4] = _mm256_set_epi64x( s[37], s[33], s[36], s[32] );
d1[4] = _mm256_set_epi64x( s[39], s[35], s[38], s[34] );
d0[5] = _mm256_set_epi64x( s[45], s[41], s[44], s[40] );
d1[5] = _mm256_set_epi64x( s[47], s[43], s[46], s[42] );
d0[6] = _mm256_set_epi64x( s[53], s[49], s[52], s[48] );
d1[6] = _mm256_set_epi64x( s[55], s[51], s[54], s[50] );
d0[7] = _mm256_set_epi64x( s[61], s[57], s[60], s[56] );
d1[7] = _mm256_set_epi64x( s[63], s[59], s[62], s[58] );
}
static inline void mm256_rintrlv_2x128_4x64( void *dst, const void *src0,
const void *src1, int bit_len )
{
__m256i* d = (__m256i*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
d[ 0] = _mm256_set_epi64x( s1[2], s1[0], s0[2], s0[0] );
d[ 1] = _mm256_set_epi64x( s1[3], s1[1], s0[3], s0[1] );
d[ 2] = _mm256_set_epi64x( s1[6], s1[4], s0[6], s0[4] );
d[ 3] = _mm256_set_epi64x( s1[7], s1[5], s0[7], s0[5] );
if ( bit_len <= 256 ) return;
d[ 4] = _mm256_set_epi64x( s1[10], s1[ 8], s0[10], s0[ 8] );
d[ 5] = _mm256_set_epi64x( s1[11], s1[ 9], s0[11], s0[ 9] );
d[ 6] = _mm256_set_epi64x( s1[14], s1[12], s0[14], s0[12] );
d[ 7] = _mm256_set_epi64x( s1[15], s1[13], s0[15], s0[13] );
if ( bit_len <= 512 ) return;
d[ 8] = _mm256_set_epi64x( s1[18], s1[16], s0[18], s0[16] );
d[ 9] = _mm256_set_epi64x( s1[19], s1[17], s0[19], s0[17] );
d[10] = _mm256_set_epi64x( s1[22], s1[20], s0[22], s0[20] );
d[11] = _mm256_set_epi64x( s1[23], s1[21], s0[23], s0[21] );
d[12] = _mm256_set_epi64x( s1[26], s1[24], s0[26], s0[24] );
d[13] = _mm256_set_epi64x( s1[27], s1[25], s0[27], s0[25] );
d[14] = _mm256_set_epi64x( s1[30], s1[28], s0[30], s0[28] );
d[15] = _mm256_set_epi64x( s1[31], s1[29], s0[31], s0[29] );
}
static inline void mm256_intrlv_2x128( void *d, const void *s0,
                                       const void *s1, const int bit_len )
{
__m128i s1hi = _mm256_extracti128_si256( casti_m256i( s1,0 ), 1 );
__m128i s0hi = _mm256_extracti128_si256( casti_m256i( s0,0 ), 1 );
casti_m256i( d,0 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,0 ) ),
_mm256_castsi256_si128( casti_m256i( s0,0 ) ) );
casti_m256i( d,1 ) = mm256_concat_128( s1hi, s0hi );
if ( bit_len <= 256 ) return;
s0hi = _mm256_extracti128_si256( casti_m256i( s0,1 ), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,1 ), 1 );
casti_m256i( d,2 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,1 ) ),
_mm256_castsi256_si128( casti_m256i( s0,1 ) ) );
casti_m256i( d,3 ) = mm256_concat_128( s1hi, s0hi );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
casti_m256i( d,4 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
return;
}
s0hi = _mm256_extracti128_si256( casti_m256i( s0,2 ), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,2 ), 1 );
casti_m256i( d,4 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
casti_m256i( d,5 ) = mm256_concat_128( s1hi, s0hi );
s0hi = _mm256_extracti128_si256( casti_m256i( s0,3 ), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,3 ), 1 );
casti_m256i( d,6 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,3 ) ),
_mm256_castsi256_si128( casti_m256i( s0,3 ) ) );
casti_m256i( d,7 ) = mm256_concat_128( s1hi, s0hi );
}
// 512 bits is the most common data length; this version eliminates the
// conditionals.
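// The _mm256_permute2x128_si256 selectors do all the work: 0x20 packs the
// low 128 bit halves of the two source rows (lane 0 data) and 0x31 packs
// the high halves (lane 1 data).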
static inline void mm256_dintrlv_2x128_512( void *dst0, void *dst1,
const void *s )
{
__m256i *d0 = (__m256i*)dst0;
__m256i *d1 = (__m256i*)dst1;
__m256i s0 = casti_m256i( s, 0 );
__m256i s1 = casti_m256i( s, 1 );
d0[0] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[0] = _mm256_permute2x128_si256( s0, s1, 0x31 );
s0 = casti_m256i( s, 2 );
s1 = casti_m256i( s, 3 );
d0[1] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[1] = _mm256_permute2x128_si256( s0, s1, 0x31 );
}
// Phase out usage of this version for 512 bit data lengths; use the 512 bit
// version above instead.
static inline void mm256_dintrlv_2x128( void *dst0, void *dst1, const void *s,
int bit_len )
{
__m256i *d0 = (__m256i*)dst0;
__m256i *d1 = (__m256i*)dst1;
__m256i s0 = casti_m256i( s, 0 );
__m256i s1 = casti_m256i( s, 1 );
d0[0] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[0] = _mm256_permute2x128_si256( s0, s1, 0x31 );
if ( bit_len <= 256 ) return;
s0 = casti_m256i( s, 2 );
s1 = casti_m256i( s, 3 );
d0[1] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[1] = _mm256_permute2x128_si256( s0, s1, 0x31 );
if ( bit_len <= 512 ) return;
s0 = casti_m256i( s, 4 );
s1 = casti_m256i( s, 5 );
d0[2] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[2] = _mm256_permute2x128_si256( s0, s1, 0x31 );
s0 = casti_m256i( s, 6 );
s1 = casti_m256i( s, 7 );
d0[3] = _mm256_permute2x128_si256( s0, s1, 0x20 );
d1[3] = _mm256_permute2x128_si256( s0, s1, 0x31 );
}
#undef extr64_cast128_256
#undef extr32_cast128_256
#endif // AVX
#endif // INTRLV_AVX_H__

View File

@@ -1,42 +1,13 @@
#if !defined(INTRLV_AVX22_H__)
#if !defined(INTRLV_AVX2_H__)
#define INTRLV_AVX2_H__ 1
#if defined(__AVX2__)
// Convenient short cuts for local use only
// Extract 64 bits from the low 128 bits of 256 bit vector.
#define extr64_cast128_256( a, n ) \
_mm_extract_epi64( _mm256_castsi256_si128( a ), n )
// Extract 32 bits from the low 128 bits of 256 bit vector.
#define extr32_cast128_256( a, n ) \
_mm_extract_epi32( _mm256_castsi256_si128( a ), n )
///////////////////////////////////////////////////////////
//
// AVX2 256 Bit Vectors
//
#define mm256_put_64( s0, s1, s2, s3) \
_mm256_set_epi64x( *((const uint64_t*)(s3)), *((const uint64_t*)(s2)), \
*((const uint64_t*)(s1)), *((const uint64_t*)(s0)) )
#define mm256_put_32( s00, s01, s02, s03, s04, s05, s06, s07 ) \
_mm256_set_epi32( *((const uint32_t*)(s07)), *((const uint32_t*)(s06)), \
*((const uint32_t*)(s05)), *((const uint32_t*)(s04)), \
*((const uint32_t*)(s03)), *((const uint32_t*)(s02)), \
*((const uint32_t*)(s01)), *((const uint32_t*)(s00)) )
#define mm256_get_64( s, i0, i1, i2, i3 ) \
_mm256_set_epi64x( ((const uint64_t*)(s))[i3], ((const uint64_t*)(s))[i2], \
((const uint64_t*)(s))[i1], ((const uint64_t*)(s))[i0] )
#define mm256_get_32( s, i0, i1, i2, i3, i4, i5, i6, i7 ) \
_mm256_set_epi32( ((const uint32_t*)(s))[i7], ((const uint32_t*)(s))[i6], \
((const uint32_t*)(s))[i5], ((const uint32_t*)(s))[i4], \
((const uint32_t*)(s))[i3], ((const uint32_t*)(s))[i2], \
((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
// A few functions that need AVX2 for 256 bit.
// Blend 2 vectors alternating hi & lo: { hi[n], lo[n-1], ... hi[1], lo[0] }
@@ -49,82 +20,6 @@
#define mm256_intrlv_blend_32( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x55 )
// Interleave 8x32_256
#define mm256_intrlv_8x32_256( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
{ \
__m128i s0hi = mm128_extr_hi128_256( s0 ); \
__m128i s1hi = mm128_extr_hi128_256( s1 ); \
__m128i s2hi = mm128_extr_hi128_256( s2 ); \
__m128i s3hi = mm128_extr_hi128_256( s3 ); \
__m128i s4hi = mm128_extr_hi128_256( s4 ); \
__m128i s5hi = mm128_extr_hi128_256( s5 ); \
__m128i s6hi = mm128_extr_hi128_256( s6 ); \
__m128i s7hi = mm128_extr_hi128_256( s7 ); \
casti_m256i( d,0 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,0), extr32_cast128_256(s6,0), \
extr32_cast128_256(s5,0), extr32_cast128_256(s4,0), \
extr32_cast128_256(s3,0), extr32_cast128_256(s2,0), \
extr32_cast128_256(s1,0), extr32_cast128_256(s0,0) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,1), extr32_cast128_256(s6,1), \
extr32_cast128_256(s5,1), extr32_cast128_256(s4,1), \
extr32_cast128_256(s3,1), extr32_cast128_256(s2,1), \
extr32_cast128_256(s1,1), extr32_cast128_256(s0,1) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,2), extr32_cast128_256(s6,2), \
extr32_cast128_256(s5,2), extr32_cast128_256(s4,2), \
extr32_cast128_256(s3,2), extr32_cast128_256(s2,2), \
extr32_cast128_256(s1,2), extr32_cast128_256(s0,2) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,3), extr32_cast128_256(s6,3), \
extr32_cast128_256(s5,3), extr32_cast128_256(s4,3), \
extr32_cast128_256(s3,3), extr32_cast128_256(s2,3), \
extr32_cast128_256(s1,3), extr32_cast128_256(s0,3) ); \
casti_m256i( d,4 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,0), mm128_extr_32(s6hi,0), \
mm128_extr_32(s5hi,0), mm128_extr_32(s4hi,0), \
mm128_extr_32(s3hi,0), mm128_extr_32(s2hi,0), \
mm128_extr_32(s1hi,0), mm128_extr_32(s0hi,0) ); \
casti_m256i( d,5 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,1), mm128_extr_32(s6hi,1), \
mm128_extr_32(s5hi,1), mm128_extr_32(s4hi,1), \
mm128_extr_32(s3hi,1), mm128_extr_32(s2hi,1), \
mm128_extr_32(s1hi,1), mm128_extr_32(s0hi,1) ); \
casti_m256i( d,6 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,2), mm128_extr_32(s6hi,2), \
mm128_extr_32(s5hi,2), mm128_extr_32(s4hi,2), \
mm128_extr_32(s3hi,2), mm128_extr_32(s2hi,2), \
mm128_extr_32(s1hi,2), mm128_extr_32(s0hi,2) ); \
casti_m256i( d,7 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,3), mm128_extr_32(s6hi,3), \
mm128_extr_32(s5hi,3), mm128_extr_32(s4hi,3), \
mm128_extr_32(s3hi,3), mm128_extr_32(s2hi,3), \
mm128_extr_32(s1hi,3), mm128_extr_32(s0hi,3) ); \
} while(0)
#define mm256_intrlv_8x32_128( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
{ \
casti_m256i( d,0 ) = _mm256_set_epi32( \
mm128_extr_32(s7,0), mm128_extr_32(s6,0), \
mm128_extr_32(s5,0), mm128_extr_32(s4,0), \
mm128_extr_32(s3,0), mm128_extr_32(s2,0), \
mm128_extr_32(s1,0), mm128_extr_32(s0,0) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
mm128_extr_32(s7,1), mm128_extr_32(s6,1), \
mm128_extr_32(s5,1), mm128_extr_32(s4,1), \
mm128_extr_32(s3,1), mm128_extr_32(s2,1), \
mm128_extr_32(s1,1), mm128_extr_32(s0,1) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
mm128_extr_32(s7,2), mm128_extr_32(s6,2), \
mm128_extr_32(s5,2), mm128_extr_32(s4,2), \
mm128_extr_32(s3,2), mm128_extr_32(s2,2), \
mm128_extr_32(s1,2), mm128_extr_32(s0,2) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
mm128_extr_32(s7,3), mm128_extr_32(s6,3), \
mm128_extr_32(s5,3), mm128_extr_32(s4,3), \
mm128_extr_32(s3,3), mm128_extr_32(s2,3), \
mm128_extr_32(s1,3), mm128_extr_32(s0,3) ); \
} while(0)
#define mm256_bswap_intrlv_8x32_256( d, src ) \
do { \
@@ -153,128 +48,6 @@ do { \
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 3 ) ); \
} while(0)
#define mm256_dintrlv_8x32_256( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
do { \
__m256i s0 = casti_m256i(s,0); \
__m256i s1 = casti_m256i(s,1); \
__m256i s2 = casti_m256i(s,2); \
__m256i s3 = casti_m256i(s,3); \
__m256i s4 = casti_m256i(s,4); \
__m256i s5 = casti_m256i(s,5); \
__m256i s6 = casti_m256i(s,6); \
__m256i s7 = casti_m256i(s,7); \
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
__m128i s4hi = _mm256_extracti128_si256( s4, 1 ); \
__m128i s5hi = _mm256_extracti128_si256( s5, 1 ); \
__m128i s6hi = _mm256_extracti128_si256( s6, 1 ); \
__m128i s7hi = _mm256_extracti128_si256( s7, 1 ); \
d0 = _mm256_set_epi32( \
extr32_cast128_256( s7, 0 ), extr32_cast128_256( s6, 0 ), \
extr32_cast128_256( s5, 0 ), extr32_cast128_256( s4, 0 ), \
extr32_cast128_256( s3, 0 ), extr32_cast128_256( s2, 0 ), \
extr32_cast128_256( s1, 0 ), extr32_cast128_256( s0, 0 ) );\
d1 = _mm256_set_epi32( \
extr32_cast128_256( s7, 1 ), extr32_cast128_256( s6, 1 ), \
extr32_cast128_256( s5, 1 ), extr32_cast128_256( s4, 1 ), \
extr32_cast128_256( s3, 1 ), extr32_cast128_256( s2, 1 ), \
extr32_cast128_256( s1, 1 ), extr32_cast128_256( s0, 1 ) );\
d2 = _mm256_set_epi32( \
extr32_cast128_256( s7, 2 ), extr32_cast128_256( s6, 2 ), \
extr32_cast128_256( s5, 2 ), extr32_cast128_256( s4, 2 ), \
extr32_cast128_256( s3, 2 ), extr32_cast128_256( s2, 2 ), \
extr32_cast128_256( s1, 2 ), extr32_cast128_256( s0, 2 ) );\
d3 = _mm256_set_epi32( \
extr32_cast128_256( s7, 3 ), extr32_cast128_256( s6, 3 ), \
extr32_cast128_256( s5, 3 ), extr32_cast128_256( s4, 3 ), \
extr32_cast128_256( s3, 3 ), extr32_cast128_256( s2, 3 ), \
extr32_cast128_256( s1, 3 ), extr32_cast128_256( s0, 3 ) );\
d4 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 0 ), _mm_extract_epi32( s6hi, 0 ), \
_mm_extract_epi32( s5hi, 0 ), _mm_extract_epi32( s4hi, 0 ), \
_mm_extract_epi32( s3hi, 0 ), _mm_extract_epi32( s2hi, 0 ), \
_mm_extract_epi32( s1hi, 0 ), _mm_extract_epi32( s0hi, 0 ) ); \
d5 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 1 ), _mm_extract_epi32( s6hi, 1 ), \
_mm_extract_epi32( s5hi, 1 ), _mm_extract_epi32( s4hi, 1 ), \
_mm_extract_epi32( s3hi, 1 ), _mm_extract_epi32( s2hi, 1 ), \
_mm_extract_epi32( s1hi, 1 ), _mm_extract_epi32( s0hi, 1 ) ); \
d6 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 2 ), _mm_extract_epi32( s6hi, 2 ), \
_mm_extract_epi32( s5hi, 2 ), _mm_extract_epi32( s4hi, 2 ), \
_mm_extract_epi32( s3hi, 2 ), _mm_extract_epi32( s2hi, 2 ), \
_mm_extract_epi32( s1hi, 2 ), _mm_extract_epi32( s0hi, 2 ) ); \
d7 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 3 ), _mm_extract_epi32( s6hi, 3 ), \
_mm_extract_epi32( s5hi, 3 ), _mm_extract_epi32( s4hi, 3 ), \
_mm_extract_epi32( s3hi, 3 ), _mm_extract_epi32( s2hi, 3 ), \
_mm_extract_epi32( s1hi, 3 ), _mm_extract_epi32( s0hi, 3 ) ); \
} while(0)
#define mm128_dintrlv_8x32_128( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
do { \
__m128i s0 = casti_m128i(s,0); \
__m128i s1 = casti_m128i(s,1); \
__m128i s2 = casti_m128i(s,2); \
__m128i s3 = casti_m128i(s,3); \
d0 = _mm_set_epi32( \
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
d1 = _mm_set_epi32( \
_mm_extract_epi32( s3, 1 ), _mm_extract_epi32( s2, 0 ), \
_mm_extract_epi32( s1, 1 ), _mm_extract_epi32( s0, 0 ) ); \
d2 = _mm_set_epi32( \
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
d3 = _mm_set_epi32( \
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
d4 = _mm_set_epi32( \
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
d5 = _mm_set_epi32( \
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
d6 = _mm_set_epi32( \
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
d7 = _mm_set_epi32( \
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s2, 0 ), \
_mm_extract_epi32( s1, 0 ), _mm_extract_epi32( s0, 0 ) ); \
} while(0)
#define mm256_intrlv_4x64_256( d, s0, s1, s2, s3 ) \
do { \
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
casti_m256i( d,0 ) = _mm256_set_epi64x( \
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ), \
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi64x( \
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ), \
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) ); \
casti_m256i( d,2 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3hi,0 ), _mm_extract_epi64( s2hi,0 ), \
_mm_extract_epi64( s1hi,0 ), _mm_extract_epi64( s0hi,0 ) ); \
casti_m256i( d,3 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3hi,1 ), _mm_extract_epi64( s2hi,1 ), \
_mm_extract_epi64( s1hi,1 ), _mm_extract_epi64( s0hi,1 ) ); \
} while(0)
#define mm256_intrlv_4x64_128( d, s0, s1, s2, s3 ) \
do { \
casti_m256i( d,0 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3, 0 ), _mm_extract_epi64( s2, 0 ), \
_mm_extract_epi64( s1, 0 ), _mm_extract_epi64( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3, 1 ), _mm_extract_epi64( s2, 1 ), \
_mm_extract_epi64( s1, 1 ), _mm_extract_epi64( s0, 1 ) ); \
} while(0)
#define mm256_bswap_intrlv_4x64_256( d, src ) \
do { \
__m256i s0 = mm256_bswap_32( src ); \
@@ -294,94 +67,6 @@ do { \
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 1 ) ); \
} while(0)
// 4 lanes of 256 bits using 64 bit interleaving (standard final hash size)
static inline void mm256_dintrlv_4x64_256( void *d0, void *d1, void *d2,
void *d3, const int n, const void *src )
{
__m256i s0 = *( (__m256i*) src ); // s[0][1:0]
__m256i s1 = *( (__m256i*)(src+32) ); // s[1][1:0]
__m256i s2 = *( (__m256i*)(src+64) ); // s[2][1:0]
__m256i s3 = *( (__m256i*)(src+96) ); // s[3][2:0]
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); // s[0][3:2]
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); // s[1][3:2]
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); // s[2][3:2]
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); // s[3][3:2]
casti_m256i( d0,n ) = _mm256_set_epi64x(
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ),
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) );
casti_m256i( d1,n ) = _mm256_set_epi64x(
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ),
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) );
casti_m256i( d2,n ) = _mm256_set_epi64x(
_mm_extract_epi64( s3hi, 0 ), _mm_extract_epi64( s2hi, 0 ),
_mm_extract_epi64( s1hi, 0 ), _mm_extract_epi64( s0hi, 0 ) );
casti_m256i( d3,n ) = _mm256_set_epi64x(
_mm_extract_epi64( s3hi, 1 ), _mm_extract_epi64( s2hi, 1 ),
_mm_extract_epi64( s1hi, 1 ), _mm_extract_epi64( s0hi, 1 ) );
}
// quarter avx2 block, 16 bytes * 4 lanes
// 4 lanes of 128 bits using 64 bit interleaving
// Used for last 16 bytes of 80 byte input, only used for testing.
static inline void mm128_dintrlv_4x64_128( void *d0, void *d1, void *d2,
void *d3, const int n, const void *src )
{
__m256i s0 = *( (__m256i*) src );
__m256i s1 = *( (__m256i*)(src+32) );
__m128i s0hi = _mm256_extracti128_si256( s0, 1 );
__m128i s1hi = _mm256_extracti128_si256( s1, 1 );
casti_m128i( d0,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 0 ),
extr64_cast128_256( s0 , 0 ) );
casti_m128i( d1,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 1 ),
extr64_cast128_256( s0 , 1 ) );
casti_m128i( d2,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 0 ),
_mm_extract_epi64( s0hi, 0 ) );
casti_m128i( d3,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 1 ),
_mm_extract_epi64( s0hi, 1 ) );
}
/*
static inline void mm256_dintrlv_2x128x256( void *d0, void *d1,
const int n, const void *s )
{
casti_m256i( d0,n ) = mm256_get_64( s, 0, 1, 4, 5 );
casti_m256i( d1,n ) = mm256_get_64( s, 2, 3, 6, 7 );
}
*/
//
// Interleave 8 source buffers containing 32 bit data into the destination
// vector
static inline void mm256_intrlv_8x32( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, const void *s4,
const void *s5, const void *s6, const void *s7, int bit_len )
{
mm256_intrlv_8x32_256( d , casti_m256i( s0,0 ), casti_m256i( s1,0 ),
casti_m256i( s2,0 ), casti_m256i( s3,0 ), casti_m256i( s4,0 ),
casti_m256i( s5,0 ), casti_m256i( s6,0 ), casti_m256i( s7,0 ) );
if ( bit_len <= 256 ) return;
mm256_intrlv_8x32_256( d+256, casti_m256i( s0,1 ), casti_m256i( s1,1 ),
casti_m256i( s2,1 ), casti_m256i( s3,1 ), casti_m256i( s4,1 ),
casti_m256i( s5,1 ), casti_m256i( s6,1 ), casti_m256i( s7,1 ) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm256_intrlv_8x32_128( d+512, casti_m128i( s0,4 ), casti_m128i( s1,4 ),
casti_m128i( s2,4 ), casti_m128i( s3,4 ), casti_m128i( s4,4 ),
casti_m128i( s5,4 ), casti_m128i( s6,4 ), casti_m128i( s7,4 ) );
return;
}
mm256_intrlv_8x32_256( d+512, casti_m256i( s0,2 ), casti_m256i( s1,2 ),
casti_m256i( s2,2 ), casti_m256i( s3,2 ), casti_m256i( s4,2 ),
casti_m256i( s5,2 ), casti_m256i( s6,2 ), casti_m256i( s7,2 ) );
mm256_intrlv_8x32_256( d+768, casti_m256i( s0,3 ), casti_m256i( s1,3 ),
casti_m256i( s2,3 ), casti_m256i( s3,3 ), casti_m256i( s4,3 ),
casti_m256i( s5,3 ), casti_m256i( s6,3 ), casti_m256i( s7,3 ) );
// bit_len == 1024
}
// A couple of mining specifi functions.
@@ -393,72 +78,6 @@ static inline void mm256_bswap_intrlv80_8x32( void *d, const void *s )
mm256_bswap_intrlv_8x32_128( d+512, casti_m128i( s, 4 ) );
}
// Deinterleave 8 buffers of 32 bit data from the source buffer.
// Sub-function can be called directly for 32 byte final hash.
static inline void mm256_dintrlv_8x32( void *d0, void *d1, void *d2,
void *d3, void *d4, void *d5, void *d6, void *d7,
const void *s, int bit_len )
{
mm256_dintrlv_8x32_256( casti_m256i(d0,0), casti_m256i(d1,0),
casti_m256i(d2,0), casti_m256i(d3,0), casti_m256i(d4,0),
casti_m256i(d5,0), casti_m256i(d6,0), casti_m256i(d7,0), s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_8x32_256( casti_m256i(d0,1), casti_m256i(d1,1),
casti_m256i(d2,1), casti_m256i(d3,1), casti_m256i(d4,1),
casti_m256i(d5,1), casti_m256i(d6,1), casti_m256i(d7,1), s+256 );
if ( bit_len <= 512 ) return;
// short block, final 16 bytes of input data
if ( bit_len <= 640 )
{
mm128_dintrlv_8x32_128( casti_m128i(d0,2), casti_m128i(d1,2),
casti_m128i(d2,2), casti_m128i(d3,2), casti_m128i(d4,2),
casti_m128i(d5,2), casti_m128i(d6,2), casti_m128i(d7,2), s+512 );
return;
}
// bitlen == 1024
mm256_dintrlv_8x32_256( casti_m256i(d0,2), casti_m256i(d1,2),
casti_m256i(d2,2), casti_m256i(d3,2), casti_m256i(d4,2),
casti_m256i(d5,2), casti_m256i(d6,2), casti_m256i(d7,2), s+512 );
mm256_dintrlv_8x32_256( casti_m256i(d0,3), casti_m256i(d1,3),
casti_m256i(d2,3), casti_m256i(d3,3), casti_m256i(d4,3),
casti_m256i(d5,3), casti_m256i(d6,3), casti_m256i(d7,3), s+768 );
}
static inline void mm256_extract_lane_8x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d,0 ) = mm256_get_32(s, lane , lane+ 8, lane+ 16, lane+ 24,
lane+32, lane+ 40, lane+ 48, lane+ 56 );
if ( bit_len <= 256 ) return;
casti_m256i( d,1 ) = mm256_get_32(s, lane+64, lane+ 72, lane+ 80, lane+ 88,
lane+96, lane+104, lane+112, lane+120 );
// bit_len == 512
}
// Interleave 4 source buffers containing 64 bit data into the destination
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
static inline void mm256_intrlv_4x64( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, int bit_len )
{
mm256_intrlv_4x64_256( d , casti_m256i(s0,0), casti_m256i(s1,0),
casti_m256i(s2,0), casti_m256i(s3,0) );
if ( bit_len <= 256 ) return;
mm256_intrlv_4x64_256( d+128, casti_m256i(s0,1), casti_m256i(s1,1),
casti_m256i(s2,1), casti_m256i(s3,1) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm256_intrlv_4x64_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
casti_m128i(s2,4), casti_m128i(s3,4) );
return;
}
// bit_len == 1024
mm256_intrlv_4x64_256( d+256, casti_m256i(s0,2), casti_m256i(s1,2),
casti_m256i(s2,2), casti_m256i(s3,2) );
mm256_intrlv_4x64_256( d+384, casti_m256i(s0,3), casti_m256i(s1,3),
casti_m256i(s2,3), casti_m256i(s3,3) );
}
// Interleave 80 bytes of 32 bit data for 8 lanes.
static inline void mm256_bswap_intrlv80_4x64( void *d, const void *s )
{
@@ -481,258 +100,5 @@ do { \
dst[7] = _mm256_blendv_epi8( a[7], b[7], mask ); \
} while(0)
// Deinterleave 4 buffers of 64 bit data from the source buffer.
// bit_len must be 256, 512, 640 or 1024 bits.
// Requires overrun padding for 640 bit len.
static inline void mm256_dintrlv_4x64( void *d0, void *d1, void *d2,
void *d3, const void *s, int bit_len )
{
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 0, s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 1, s+128 );
if ( bit_len <= 512 ) return;
// short block, final 16 bytes of input data
if ( bit_len <= 640 )
{
mm128_dintrlv_4x64_128( d0, d1, d2, d3, 4, s+256 );
return;
}
// bit_len == 1024
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 2, s+256 );
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 3, s+384 );
}
// extract and deinterleave specified lane.
#define mm256_extract_lane_4x64_256 \
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 )
static inline void mm256_extract_lane_4x64( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 );
if ( bit_len <= 256 ) return;
casti_m256i( d, 1 ) = mm256_get_64( s, lane+16, lane+20, lane+24, lane+28 );
return;
}
// Convert from 4x32 SSE2 interleaving to 4x64 AVX2.
// Can't do it in place
static inline void mm256_rintrlv_4x32_4x64( void *dst, void *src,
int bit_len )
{
__m256i* d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 3],s[ 6],s[ 2],s[ 5],s[ 1],s[ 4],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[ 9],s[12],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32(s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80]);
d[11] = _mm256_set_epi32(s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88]);
d[12] = _mm256_set_epi32(s[103],s[99],s[102],s[98],s[101],s[97],s[100],s[96]);
d[13] = _mm256_set_epi32(s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104]);
d[14] = _mm256_set_epi32(s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112]);
d[15] = _mm256_set_epi32(s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120]);
// bit_len == 1024
}
// Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
// bit_len must be multiple of 64
static inline void mm256_rintrlv_4x64_4x32( void *dst, void *src,
int bit_len )
{
__m256i *d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
d[12] = _mm256_set_epi32( s[103],s[101],s[99],s[97],s[102],s[100],s[98],s[96] );
d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
// bit_len == 1024
}
static inline void mm256_rintrlv_4x64_2x128( void *dst0, void *dst1,
const void *src, int bit_len )
{
__m256i* d0 = (__m256i*)dst0;
__m256i* d1 = (__m256i*)dst1;
uint64_t *s = (uint64_t*)src;
d0[0] = _mm256_set_epi64x( s[ 5], s[ 1], s[ 4], s[ 0] );
d1[0] = _mm256_set_epi64x( s[ 7], s[ 3], s[ 6], s[ 2] );
d0[1] = _mm256_set_epi64x( s[13], s[ 9], s[12], s[ 8] );
d1[1] = _mm256_set_epi64x( s[15], s[11], s[14], s[10] );
if ( bit_len <= 256 ) return;
d0[2] = _mm256_set_epi64x( s[21], s[17], s[20], s[16] );
d1[2] = _mm256_set_epi64x( s[23], s[19], s[22], s[18] );
d0[3] = _mm256_set_epi64x( s[29], s[25], s[28], s[24] );
d1[3] = _mm256_set_epi64x( s[31], s[27], s[30], s[26] );
if ( bit_len <= 512 ) return;
d0[4] = _mm256_set_epi64x( s[37], s[33], s[36], s[32] );
d1[4] = _mm256_set_epi64x( s[39], s[35], s[38], s[34] );
d0[5] = _mm256_set_epi64x( s[45], s[41], s[44], s[40] );
d1[5] = _mm256_set_epi64x( s[47], s[43], s[46], s[42] );
d0[6] = _mm256_set_epi64x( s[53], s[49], s[52], s[48] );
d1[6] = _mm256_set_epi64x( s[55], s[51], s[54], s[50] );
d0[7] = _mm256_set_epi64x( s[61], s[57], s[60], s[56] );
d1[7] = _mm256_set_epi64x( s[63], s[59], s[62], s[58] );
}
static inline void mm256_rintrlv_2x128_4x64( void *dst, const void *src0,
const void *src1, int bit_len )
{
__m256i* d = (__m256i*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
d[ 0] = _mm256_set_epi64x( s1[2], s1[0], s0[2], s0[0] );
d[ 1] = _mm256_set_epi64x( s1[3], s1[1], s0[3], s0[1] );
d[ 2] = _mm256_set_epi64x( s1[6], s1[4], s0[6], s0[4] );
d[ 3] = _mm256_set_epi64x( s1[7], s1[5], s0[7], s0[5] );
if ( bit_len <= 256 ) return;
d[ 4] = _mm256_set_epi64x( s1[10], s1[ 8], s0[10], s0[ 8] );
d[ 5] = _mm256_set_epi64x( s1[11], s1[ 9], s0[11], s0[ 9] );
d[ 6] = _mm256_set_epi64x( s1[14], s1[12], s0[14], s0[12] );
d[ 7] = _mm256_set_epi64x( s1[15], s1[13], s0[15], s0[13] );
if ( bit_len <= 512 ) return;
d[ 8] = _mm256_set_epi64x( s1[18], s1[16], s0[18], s0[16] );
d[ 9] = _mm256_set_epi64x( s1[19], s1[17], s0[19], s0[17] );
d[10] = _mm256_set_epi64x( s1[22], s1[20], s0[22], s0[20] );
d[11] = _mm256_set_epi64x( s1[23], s1[21], s0[23], s0[21] );
d[12] = _mm256_set_epi64x( s1[26], s1[24], s0[26], s0[24] );
d[13] = _mm256_set_epi64x( s1[27], s1[25], s0[27], s0[25] );
d[14] = _mm256_set_epi64x( s1[30], s1[28], s0[30], s0[28] );
d[15] = _mm256_set_epi64x( s1[31], s1[29], s0[31], s0[29] );
}
static inline void mm256_intrlv_2x128( const void *d, const void *s0,
void *s1, const int bit_len )
{
__m128i s1hi = _mm256_extracti128_si256( casti_m256i( s1,0), 1 );
__m128i s0hi = _mm256_extracti128_si256( casti_m256i( s0,0), 1 );
casti_m256i( d,0 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,0 ) ),
_mm256_castsi256_si128( casti_m256i( s0,0 ) ) );
casti_m256i( d,1 ) = mm256_concat_128( s1hi, s0hi );
if ( bit_len <= 256 ) return;
s0hi = _mm256_extracti128_si256( casti_m256i( s0,1), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,1), 1 );
casti_m256i( d,2 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,1 ) ),
_mm256_castsi256_si128( casti_m256i( s0,1 ) ) );
casti_m256i( d,3 ) = mm256_concat_128( s1hi, s0hi );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
casti_m256i( d,4 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
return;
}
s0hi = _mm256_extracti128_si256( casti_m256i( s0,2), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,2), 1 );
casti_m256i( d,4 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
casti_m256i( d,5 ) = mm256_concat_128( s1hi, s0hi );
s0hi = _mm256_extracti128_si256( casti_m256i( s0,3), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,3), 1 );
casti_m256i( d,6 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,3 ) ),
_mm256_castsi256_si128( casti_m256i( s0,3 ) ) );
casti_m256i( d,7 ) = mm256_concat_128( s1hi, s0hi );
}
static inline void mm256_dintrlv_2x128( void *dst0, void *dst1, const void *s,
int bit_len )
{
__m256i *d0 = (__m256i*)dst0;
__m256i *d1 = (__m256i*)dst1;
__m256i s0 = casti_m256i( s, 0 );
__m256i s1 = casti_m256i( s, 1 );
d0[0] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[0] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
if ( bit_len <= 256 ) return;
s0 = casti_m256i( s, 2 );
s1 = casti_m256i( s, 3 );
d0[1] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[1] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
if ( bit_len <= 512 ) return;
s0 = casti_m256i( s, 4 );
s1 = casti_m256i( s, 5 );
d0[2] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[2] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
s0 = casti_m256i( s, 6 );
s1 = casti_m256i( s, 7 );
d0[3] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[3] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
}
#undef extr64_cast128_256
#undef extr32_cast128_256
#endif // AVX2
#endif // INTRLV_AVX22_H__
#endif // INTRLV_AVX2_H__

View File

@@ -442,7 +442,7 @@ static inline void mm512_dintrlv_16x32( void *d00, void *d01, void *d02,
1, src+1024 );
}
static inline void mm512_extract_lane_16x32( void *dst, const void *src,
static inline void mm512_extr_lane_16x32( void *dst, const void *src,
const int lane, const int bit_len )
{
if ( bit_len <= 256 )
@@ -506,7 +506,7 @@ static inline void mm512_dintrlv_8x64( void *d0, void *d1, void *d2,
}
// Extract one lane from 64 bit interleaved data
static inline void mm512_extract_lane_8x64( void *d, const void *s,
static inline void mm512_extr_lane_8x64( void *d, const void *s,
const int lane, const int bit_len )
{
if ( bit_len <= 256 )
@@ -661,7 +661,7 @@ static inline void mm512_rintrlv_4x128_8x64( void *dst, const void *src0,
}
static inline void mm512_extract_lane_4x128( void *d, const void *s,
static inline void mm512_extr_lane_4x128( void *d, const void *s,
const int lane, const int bit_len )
{
int l = lane<<1;

View File

@@ -104,7 +104,7 @@ static inline void mm64_dintrlv_2x32( void *d00, void *d01, const int n,
casti_m64( d01,15 ) = mm64_get_32( s, 61, 63 );
}
static inline void mm64_extract_lane_2x32( void *d, const void *s,
static inline void mm64_extr_lane_2x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m64( d, 0 ) = mm64_get_32( s, lane , lane+ 4 );

View File

@@ -0,0 +1,77 @@
#if !defined(INTRLV_SELECTOR_H__)
#define INTRLV_SELECTOR_H__
//////////////////////////////////////////////////////////////
//
// Generic interface for interleaving data for parallel processing.
//
// Best tech is chosen automatically.
/*
#if defined(__AVX512F__)
#define intrlv_4x128 mm512_intrlv_4x128
#define dintrlv_4x128 mm512_dintrlv_4x128
#define intrlv_8x64 mm512_intrlv_8x64
#define dintrlv_8x64 mm512_dintrlv_8x64
#define extr_lane_8x64 mm512_extr_lane_8x64
#define intrlv_16x32 mm512_intrlv_16x32
#define dintrlv_16x32 mm512_dintrlv_16x32
#define extr_lane_16x32 mm512_extr_lane_16x32
#define intrlv_2x128 mm512_intrlv_2x128
#define dintrlv_2x128 mm512_dintrlv_2x128
#define intrlv_4x64 mm512_intrlv_4x64
#define dintrlv_4x64 mm512_dintrlv_4x64
#define extr_lane_4x64 mm512_extr_lane_4x64
#define intrlv_8x32 mm512_intrlv_8x32
#define dintrlv_8x32 mm512_dintrlv_8x32
#define extr_lane_8x32 mm512_extr_lane_8x32
#elif defined(__AVX__)
*/
#if defined(__AVX__)
#define intrlv_2x128 mm256_intrlv_2x128
#define dintrlv_2x128 mm256_dintrlv_2x128
#define intrlv_4x64 mm256_intrlv_4x64
#define dintrlv_4x64 mm256_dintrlv_4x64
#define extr_lane_4x64 mm256_extr_lane_4x64
#define intrlv_8x32 mm256_intrlv_8x32
#define dintrlv_8x32 mm256_dintrlv_8x32
#define extr_lane_8x32 mm256_extr_lane_8x32
#define intrlv_4x32 mm256_intrlv_4x32
#define dintrlv_4x32 mm256_dintrlv_4x32
#define extr_lane_4x32 mm256_extr_lane_4x32
#else
#define intrlv_2x128 mm128_intrlv_2x128
#define dintrlv_2x128 mm128_dintrlv_2x128
#define intrlv_4x64 mm128_intrlv_4x64
#define dintrlv_4x64 mm128_dintrlv_4x64
#define extr_lane_4x64 mm128_extr_lane_4x64
#define intrlv_8x32 mm128_intrlv_8x32
#define dintrlv_8x32 mm128_dintrlv_8x32
#define extr_lane_8x32 mm128_extr_lane_8x32
#define intrlv_2x64 mm128_intrlv_2x64
#define dintrlv_2x64 mm128_dintrlv_2x64
#define extr_lane_2x64 mm128_extr_lane_2x64
#define intrlv_4x32 mm128_intrlv_4x32
#define dintrlv_4x32 mm128_dintrlv_4x32
#define extr_lane_4x32 mm128_extr_lane_4x32
#endif
#endif // INTRLV_SELECTOR_H__
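// Illustrative sketch (assumed caller, not part of this header): code written
// against the generic names compiles to the widest implementation available
// for the build target. The wrapper name and the intrlv_4x32 argument order
// are assumptions.
static inline void demo_hash_4way_wrap( uint32_t *hash0, uint32_t *hash1,
                                        uint32_t *hash2, uint32_t *hash3 )
{
   uint32_t vhash[8*4] __attribute__ ((aligned (32)));
   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
   // ... run a 4 lane hash over vhash here ...
   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
}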

View File

@@ -162,8 +162,8 @@ static inline void mm128_dintrlv_4x32( void *d0, void *d1, void *d2,
}
// extract and deinterleave specified lane.
static inline void mm128_extract_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
static inline void mm128_extr_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m128i( d, 0 ) =
mm128_get_32( s, lane , lane+ 4, lane+ 8, lane+12 );

View File

@@ -1,5 +1,5 @@
#if !defined(SIMD_SSE2_H__)
#define SIMD_SSE2_H__ 1
#if !defined(SIMD_128_H__)
#define SIMD_128_H__ 1
#if defined(__SSE2__)
@@ -15,69 +15,148 @@
//
// 128 bit operations are enhanced with uint128 which adds 128 bit integer
// support for arithmetic and other operations. Casting to uint128_t is not
// free, it requires a move from mmx to gpr but is often the only way or
// the more efficient way for certain operations.
// Compile time constant initializers are type agnostic and can have
// a pointer handle of almost any type. All arguments must be scalar constants
// up to 64 bits. These initializers should only be used at compile time
// to initialize vector arrays. All data reside in memory.
// efficient but is sometimes the only way for certain operations.
//
// Constants are an issue with simd. Simply put, immediate constants don't
// exist. All simd constants either reside in memory or a register.
// The distinction is made below with c128 being memory resident defined
// at compile time and m128 being register defined at run time.
//
// All run time constants must be generated using their component elements,
// incurring significant overhead. The more elements the more overhead
// both in instructions and in GP register usage. Whenever possible use
// 64 bit constant elements regardless of the actual element size.
//
// Due to the cost of generating constants they should not be regenerated
// in the same function. Instead, define a local const.
//
// Some constant values can be generated using shortcuts. Zero for example
// is as simple as XORing any register with itself, and is implemented
// in the setzero intrinsic. These shortcuts must be implemented in asm
// due to doing things the compiler would complain about. Another single
// instruction constant is -1, defined below. Others may be added as the need
// arises. Even single instruction constants are less efficient than local
// register variables so the advice above stands.
//
// One common use for simd constants is as a control index for some simd
// instructions like blend and shuffle. The utilities below do not take this
// into account. Those that generate a simd constant should not be used
// repeatedly. It may be better for the application to reimplement the
// utility to better suit its usage.
//
// These are of limited use, it is often simpler to use uint64_t arrays
// and cast as required.
#define mm128_const_64( x1, x0 ) {{ x1, x0 }}
#define mm128_const1_64( x ) {{ x, x }}
#define mm128_const_32( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
#define mm128_const1_32( x ) {{ x,x,x,x }}
#define mm128_const_16( x7, x6, x5, x4, x3, x2, x1, x0 ) \
{{ x7, x6, x5, x4, x3, x2, x1, x0 }}
#define mm128_const1_16( x ) {{ x,x,x,x, x,x,x,x }}
#define mm128_const_8( x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm128_const1_8( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
// Compile time constants, use only for compile time initializing.
#define c128_zero mm128_const1_64( 0ULL )
#define c128_one_128 mm128_const_64( 0ULL, 1ULL )
#define c128_one_64 mm128_const1_64( 1ULL )
#define c128_one_32 mm128_const1_32( 1UL )
#define c128_one_16 mm128_const1_16( 1U )
#define c128_one_8 mm128_const1_8( 1U )
#define c128_neg1 mm128_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c128_neg1_64 mm128_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c128_neg1_32 mm128_const1_32( 0xFFFFFFFFUL )
#define c128_neg1_16 mm128_const1_32( 0xFFFFU )
#define c128_neg1_8 mm128_const1_32( 0xFFU )
//
// Pseudo constants.
//
// These can't be used for compile time initialization.
// These should be used for all simple vectors.
//
// _mm_setzero_si128 uses the pxor instruction; it's unclear what _mm_set_epi does.
// Clearly it's faster than reading a memory resident constant. Assume set
// is also faster.
// If a pseudo constant is used often in a function it may be preferable
// to define a register variable to represent that constant.
// register __m128i zero = mm_setzero_si128().
// This reduces any references to a move instruction.
// Repeated usage of any simd pseudo-constant should use a locally defined
// const rather than recomputing it for every reference.
#define m128_zero _mm_setzero_si128()
#define m128_one_128 _mm_set_epi64x( 0ULL, 1ULL )
#define m128_one_64 _mm_set1_epi64x( 1ULL )
#define m128_one_32 _mm_set1_epi32( 1UL )
#define m128_one_16 _mm_set1_epi16( 1U )
#define m128_one_8 _mm_set1_epi8( 1U )
// As suggested by Intel...
// Arg passing for simd registers is assumed to be first output arg,
// then input args, then locals. This is probably wrong, gcc likely picks
// whichever register is currently holding the variable, or whichever
// register is available to hold it. Nevertheless, all args are specified
// by their arg number and local variables use registers starting at
// last arg + 1, by type.
// Output args don't need to be listed as clobbered.
#define m128_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
static inline __m128i m128_one_64_fn()
{
__m128i a;
asm( "pxor %0, %0\n\t"
"pcmpeqd %%xmm1, %%xmm1\n\t"
"psubq %%xmm1, %0\n\t"
:"=x"(a)
:
: "xmm1" );
return a;
}
#define m128_one_64 m128_one_64_fn()
static inline __m128i m128_one_32_fn()
{
__m128i a;
asm( "pxor %0, %0\n\t"
"pcmpeqd %%xmm1, %%xmm1\n\t"
"psubd %%xmm1, %0\n\t"
:"=x"(a)
:
: "xmm1" );
return a;
}
#define m128_one_32 m128_one_32_fn()
static inline __m128i m128_one_16_fn()
{
__m128i a;
asm( "pxor %0, %0\n\t"
"pcmpeqd %%xmm1, %%xmm1\n\t"
"psubw %%xmm1, %0\n\t"
:"=x"(a)
:
: "xmm1" );
return a;
}
#define m128_one_16 m128_one_16_fn()
static inline __m128i m128_one_8_fn()
{
__m128i a;
asm( "pxor %0, %0\n\t"
"pcmpeqd %%xmm1, %%xmm1\n\t"
"psubb %%xmm1, %0\n\t"
:"=x"(a)
:
: "xmm1" );
return a;
}
#define m128_one_8 m128_one_8_fn()
static inline __m128i m128_neg1_fn()
{
__m128i a;
asm( "pcmpeqd %0, %0\n\t"
:"=x"(a) );
return a;
}
#define m128_neg1 m128_neg1_fn()
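// Illustrative sketch (assumed usage, not from the original source): hoist a
// pseudo-constant into a local const when it is referenced more than once, so
// the generating instructions run only once per call. The function name is an
// assumption.
static inline __m128i demo_add_one_twice( __m128i x )
{
   const __m128i one = m128_one_32;          // generated once per call
   x = _mm_add_epi32( x, one );
   return _mm_add_epi32( x, one );           // reuses the register copy
}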
#if defined(__SSE41__)
static inline __m128i m128_one_128_fn()
{
__m128i a;
asm( "pinsrq $0, $1, %0\n\t"
"pinsrq $1, $0, %0\n\t"
:"=x"(a) );
return a;
}
#define m128_one_128 m128_one_128_fn()
// Alternative to _mm_set_epi64x that doesn't use memory.
// Cost = 2 pinsrq, estimate 4 clocks.
static inline __m128i m128_const_64( uint64_t hi, uint64_t lo )
{
__m128i a;
asm( "pinsrq $0, %2, %0\n\t"
"pinsrq $1, %1, %0\n\t"
:"=x"(a)
:"r"(hi),"r"(lo) );
return a;
}
#else
#define m128_one_128 _mm_set_epi64x( 0ULL, 1ULL )
#define m128_const_64 _mm_set_epi64x
#endif
//
// Basic operations without equivalent SIMD intrinsic
@@ -90,9 +169,21 @@
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
#define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v )
// Use uint128_t for most arithmetic, bit shift, comparison operations
// spanning all 128 bits. Some extractions are also more efficient
// casting __m128i as uint128_t and using standard operators.
// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
_mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
#define mm128_add4_32( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
#define mm128_add4_16( a, b, c, d ) \
_mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )
#define mm128_add4_8( a, b, c, d ) \
_mm_add_epi8( _mm_add_epi8( a, b ), _mm_add_epi8( c, d ) )
#define mm128_xor4( a, b, c, d ) \
_mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )
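// Illustrative sketch (assumed usage, not from the original source): the
// 4-input forms build a balanced (a op b) op (c op d) tree so the two inner
// operations can issue in parallel instead of forming a 3-deep serial chain.
// The function name is an assumption.
static inline __m128i demo_sum4_32( const __m128i *w )
{
   return mm128_add4_32( w[0], w[1], w[2], w[3] );
}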
// This isn't cheap, not suitable for bulk usage.
#define mm128_extr_4x32( a0, a1, a2, a3, src ) \
@@ -105,6 +196,16 @@ do { \
// Horizontal vector testing
#if defined(__SSE41__)
#define mm128_allbits0( a ) _mm_testz_si128( a, a )
#define mm128_allbits1( a ) _mm_testc_si128( a, m128_neg1 )
#define mm128_allbitsne( a ) _mm_testnzc_si128( a, m128_neg1 )
#define mm128_anybits0 mm128_allbitsne
#define mm128_anybits1 mm128_allbitsne
#else // SSE2
// Bit-wise test of entire vector, useful to test results of cmp.
#define mm128_anybits0( a ) (uint128_t)(a)
#define mm128_anybits1( a ) (((uint128_t)(a))+1)
@@ -112,6 +213,8 @@ do { \
#define mm128_allbits0( a ) ( !mm128_anybits1(a) )
#define mm128_allbits1( a ) ( !mm128_anybits0(a) )
#endif // SSE41 else SSE2
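// Illustrative sketch (assumed usage, relies on the SSE4.1 definitions above):
// testing the result of a compare across the whole vector. The function name
// is an assumption.
static inline int demo_any_lane_equal_32( __m128i a, __m128i b )
{
   // cmpeq sets matching lanes to all ones; the result is all zero only if
   // no lane matched.
   return ! mm128_allbits0( _mm_cmpeq_epi32( a, b ) );
}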
//
// Vector pointer cast
@@ -139,6 +242,7 @@ do { \
#else
// Doesn't work with register variables.
#define mm128_extr_64(a,n) (((uint64_t*)&a)[n])
#define mm128_extr_32(a,n) (((uint32_t*)&a)[n])
@@ -209,7 +313,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
// Bit rotations
// AVX512 has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements. Not really useful.
// 64 and 32 bit elements.
//
// Rotate each element of v by c bits
@@ -233,13 +337,16 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
//
// Rotate elements across all lanes
// Rotate vector elements across all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
#if defined (__SSE3__)
// no SSE2 implementation, no current users
#define mm128_ror_1x16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  1, 0,15,14,13,12,11,10, \
                                       9, 8, 7, 6, 5, 4, 3, 2 ) )
@@ -252,6 +359,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
#define mm128_rol_1x8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 14,13,12,11,10, 9, 8, 7, \
6, 5, 4, 3, 2, 1, 0,15 ) )
#endif // SSE3
// Rotate 16 byte (128 bit) vector by c bytes.
// Less efficient using shift but more versatile. Use only for odd number
@@ -262,17 +370,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
#define mm128_brol( v, c ) \
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm128_invert_32( v ) _mm_shuffle_epi32( a, 0x1b )
#define mm128_invert_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, \
9, 8, 11,10, 13,12, 15,14 ) )
#define mm128_invert_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, \
8, 9,10,11,12,13,14,15 ) )
//
// Rotate elements within lanes.
@@ -283,7 +380,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
#define mm128_rol16_64( v )  _mm_shuffle_epi8( v, \
      _mm_set_epi8( 13,12,11,10, 9, 8,15,14, 5, 4, 3, 2, 1, 0, 7, 6 ) )
#define mm128_swap16_32( v )  _mm_shuffle_epi8( v, \
      _mm_set_epi8( 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2 ) )
@@ -293,17 +389,45 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
#if defined(__SSSE3__)
#define mm128_bswap_64( v ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, \
0, 1, 2, 3, 4, 5, 6, 7 ) )
   _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
#define mm128_bswap_32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15, 8, 9,10,11, \
4, 5, 6, 7, 0, 1, 2, 3 ) )
_mm_shuffle_epi8( v, m128_const_64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
#define mm128_bswap_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
6, 7, 4, 5, 2, 3, 0, 1 ) )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) do \
{ \
__m128i ctl = m128_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
} while(0)
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define mm128_block_bswap_32( d, s ) do \
{ \
__m128i ctl = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
} while(0)
#else // SSE2
// Use inline function instead of macro due to multiple statements.
@@ -326,16 +450,41 @@ static inline __m128i mm128_bswap_16( __m128i v )
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
static inline void mm128_block_bswap_64( __m128i *d, __m128i *s )
{
   d[0] = mm128_bswap_64( s[0] );
   d[1] = mm128_bswap_64( s[1] );
   d[2] = mm128_bswap_64( s[2] );
   d[3] = mm128_bswap_64( s[3] );
   d[4] = mm128_bswap_64( s[4] );
   d[5] = mm128_bswap_64( s[5] );
   d[6] = mm128_bswap_64( s[6] );
   d[7] = mm128_bswap_64( s[7] );
}
static inline void mm128_block_bswap_32( __m128i *d, __m128i *s )
{
d[0] = mm128_bswap_32( s[0] );
d[1] = mm128_bswap_32( s[1] );
d[2] = mm128_bswap_32( s[2] );
d[3] = mm128_bswap_32( s[3] );
d[4] = mm128_bswap_32( s[4] );
d[5] = mm128_bswap_32( s[5] );
d[6] = mm128_bswap_32( s[6] );
d[7] = mm128_bswap_32( s[7] );
}
#endif // SSSE3 else SSE2
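// Illustrative sketch (assumed usage, not from the original source): byte
// swapping a 128 byte block of 32 bit words in one call, e.g. to convert big
// endian message data. Buffer names are assumptions; both buffers are assumed
// 16 byte aligned.
static inline void demo_block_bswap32_128B( uint32_t *le_out, uint32_t *be_in )
{
   mm128_block_bswap_32( (__m128i*)le_out, (__m128i*)be_in );
}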
//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.
// Swap 128 bit vectors.
#define mm128_swap128_256(v1, v2) \
v1 = _mm_xor_si128(v1, v2); \
v2 = _mm_xor_si128(v1, v2); \
v1 = _mm_xor_si128(v1, v2);
#define mm128_swap128_256( v1, v2 ) \
v1 = _mm_xor_si128( v1, v2 ); \
v2 = _mm_xor_si128( v1, v2 ); \
v1 = _mm_xor_si128( v1, v2 );
// Concatenate v1 & v2 and rotate as one 256 bit vector.
#if defined(__SSE4_1__)
@@ -457,4 +606,4 @@ do { \
#endif // SSE4.1 else SSE2
#endif // __SSE2__
#endif // SIMD_SSE2_H__
#endif // SIMD_128_H__

View File

@@ -1,99 +1,134 @@
#if !defined(SIMD_AVX2_H__)
#define SIMD_AVX2_H__ 1
#if !defined(SIMD_256_H__)
#define SIMD_256_H__ 1
#if defined(__AVX2__)
#if defined(__AVX__)
/////////////////////////////////////////////////////////////////////
//
// AVX2 256 bit vectors
//
// AVX2 is required for integer support of 256 bit vectors.
// Basic support for 256 bit vectors is available with AVX but integer
// support requires AVX2.
// Some 256 bit vector utilities require AVX512 or have more efficient
// AVX512 implementations. They will be selected automatically but their use
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
// Vector type overlays used by compile time vector constants.
// Constants of these types reside in memory.
// Compile time vector constants and initializers.
//
// The following macro constants and functions should only be used
// for compile time initialization of constant and variable vector
// arrays. These constants use memory, use _mm256_set at run time to
// avoid using memory.
#define mm256_const_64( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
#define mm256_const1_64( x ) {{ x,x,x,x }}
#define mm256_const_32( x7, x6, x5, x4, x3, x2, x1, x0 ) \
{{ x7, x6, x5, x4, x3, x2, x1, x0 }}
#define mm256_const1_32( x ) {{ x,x,x,x, x,x,x,x }}
#define mm256_const_16( x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm256_const1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
#define mm256_const_8( x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm256_const1_8( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, \
x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
// Predefined compile time constant vectors.
// Use Pseudo constants at run time for all simple constant vectors.
#define c256_zero mm256_const1_64( 0ULL )
#define c256_one_256 mm256_const_64( 0ULL, 0ULL, 0ULL, 1ULL )
#define c256_one_128 mm256_const_64( 0ULL, 1ULL, 0ULL, 1ULL )
#define c256_one_64 mm256_const1_64( 1ULL )
#define c256_one_32 mm256_const1_32( 1UL )
#define c256_one_16 mm256_const1_16( 1U )
#define c256_one_8 mm256_const1_8( 1U )
#define c256_neg1 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c256_neg1_64 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c256_neg1_32 mm256_const1_32( 0xFFFFFFFFUL )
#define c256_neg1_16 mm256_const1_16( 0xFFFFU )
#define c256_neg1_8 mm256_const1_8( 0xFFU )
//
// Pseudo constants.
// These can't be used for compile time initialization but are preferable
// for simple constant vectors at run time.
// for simple constant vectors at run time. For repeated use define a local
// constant to avoid multiple calls to the same macro.
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 _mm256_set_epi64x( 0ULL, 0ULL, 0ULL, 1ULL )
#define m256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
#define m256_one_64 _mm256_set1_epi64x( 1ULL )
#define m256_one_32 _mm256_set1_epi32( 1UL )
#define m256_one_16 _mm256_set1_epi16( 1U )
#define m256_one_8 _mm256_set1_epi8( 1U )
#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
#define m256_zero _mm256_setzero_si256()
//
// Basic operations without SIMD equivalent
#define m256_one_256 \
_mm256_insertf128_si256( _mm256_castsi128_si256( m128_one_128 ), \
m128_zero, 1 )
// Bitwise not ( ~x )
#define mm256_not( x ) _mm256_xor_si256( (x), m256_neg1 ) \
#define m256_one_128 \
_mm256_insertf128_si256( _mm256_castsi128_si256( m128_one_128 ), \
m128_one_128, 1 )
// Unary negation of each element ( -a )
#define mm256_negate_64( a ) _mm256_sub_epi64( m256_zero, a )
#define mm256_negate_32( a ) _mm256_sub_epi32( m256_zero, a )
#define mm256_negate_16( a ) _mm256_sub_epi16( m256_zero, a )
// Set instructions load memory resident constants, this avoids memory access.
// Cost 4 pinsert + 1 vinsert, estimate 7 clocks.
#define m256_const_64( i3, i2, i1, i0 ) \
_mm256_insertf128_si256( _mm256_castsi128_si256( m128_const_64( i1, i0 ) ), \
m128_const_64( i3, i2 ), 1 )
#define m256_const1_64( i ) m256_const_64( i, i, i, i )
#if defined(__AVX2__)
// These look like a lot of overhead but the compiler optimizes nicely
// and puts the asm inline in the calling function. Usage is like any
// variable expression.
// __m256i foo = m256_one_64;
static inline __m256i m256_one_64_fn()
{
__m256i a;
asm( "vpxor %0, %0, %0\n\t"
"vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
"vpsubq %%ymm1, %0, %0\n\t"
:"=x"(a)
:
: "ymm1" );
return a;
}
#define m256_one_64 m256_one_64_fn()
static inline __m256i m256_one_32_fn()
{
__m256i a;
asm( "vpxor %0, %0, %0\n\t"
"vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
"vpsubd %%ymm1, %0, %0\n\t"
:"=x"(a)
:
: "ymm1" );
return a;
}
#define m256_one_32 m256_one_32_fn()
static inline __m256i m256_one_16_fn()
{
__m256i a;
asm( "vpxor %0, %0, %0\n\t"
"vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
"vpsubw %%ymm1, %0, %0\n\t"
:"=x"(a)
:
: "ymm1" );
return a;
}
#define m256_one_16 m256_one_16_fn()
static inline __m256i m256_one_8_fn()
{
__m256i a;
asm( "vpxor %0, %0, %0\n\t"
"vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
"vpsubb %%ymm1, %0, %0\n\t"
:"=x"(a)
:
: "ymm1" );
return a;
}
#define m256_one_8 m256_one_8_fn()
static inline __m256i m256_neg1_fn()
{
__m256i a;
asm( "vpcmpeqq %0, %0, %0\n\t"
:"=x"(a) );
return a;
}
#define m256_neg1 m256_neg1_fn()
#else // AVX
#define m256_one_64 _mm256_set1_epi64x( 1ULL )
#define m256_one_32 _mm256_set1_epi64x( 0x0000000100000001ULL )
#define m256_one_16 _mm256_set1_epi64x( 0x0001000100010001ULL )
#define m256_one_8 _mm256_set1_epi64x( 0x0101010101010101ULL )
// AVX doesn't have inserti128 but insertf128 will do.
// Ideally this can be done with 2 instructions and no temporary variables.
static inline __m256i m256_neg1_fn()
{
__m128i a = m128_neg1;
return _mm256_insertf128_si256( _mm256_castsi128_si256( a ), a, 1 );
}
#define m256_neg1 m256_neg1_fn()
//#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
#endif // AVX2 else AVX
//
// Vector size conversion.
//
// Allows operations on either or both halves of a 256 bit vector serially.
// Handy for parallel AES.
// Caveats:
// Caveats when writing:
// _mm256_castsi256_si128 is free and without side effects.
// _mm256_castsi128_si256 is also free but leaves the high half
// undefined. That's ok if the hi half will be subsequently assigned.
@@ -134,11 +169,21 @@ do { \
#define mm256_ins_lo128_256( a, b ) _mm256_inserti128_si256( a, b, 0 )
#define mm256_ins_hi128_256( a, b ) _mm256_inserti128_si256( a, b, 1 )
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi )
// Horizontal vector testing
#if defined(__AVX2__)
#define mm256_allbits0( a ) _mm256_testz_si256( a, a )
#define mm256_allbits1( a ) _mm256_testc_si256( a, m256_neg1 )
#define mm256_allbitsne( a ) _mm256_testnzc_si256( a, m256_neg1 )
#define mm256_anybits0 mm256_allbitsne
#define mm256_anybits1 mm256_allbitsne
#else // AVX
// Bit-wise test of entire vector, useful to test results of cmp.
#define mm256_anybits0( a ) \
@@ -152,35 +197,20 @@ do { \
#define mm256_allbits0_256( a ) ( !mm256_anybits1(a) )
#define mm256_allbits1_256( a ) ( !mm256_anybits0(a) )
#endif // AVX2 else AVX
// Parallel AES, for when x is expected to be in a 256 bit register.
#define mm256_aesenc_2x128( x ) \
mm256_concat_128( \
_mm_aesenc_si128( mm128_extr_hi128_256( x ), m128_zero ), \
_mm_aesenc_si128( mm128_extr_lo128_256( x ), m128_zero ) )
// Use same 128 bit key.
#define mm256_aesenc_2x128( x, k ) \
mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \
_mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) )
#define mm256_aesenckey_2x128( x, k ) \
mm256_concat_128( \
_mm_aesenc_si128( mm128_extr_hi128_256( x ), \
mm128_extr_lo128_256( k ) ), \
_mm_aesenc_si128( mm128_extr_hi128_256( x ), \
mm128_extr_lo128_256( k ) ) )
#define mm256_paesenc_2x128( y, x ) do \
#define mm256_paesenc_2x128( y, x, k ) do \
{ \
__m256i *X = (__m256i*)x; \
__m256i *Y = (__m256i*)y; \
y[0] = _mm_aesenc_si128( x[0], m128_zero ); \
y[1] = _mm_aesenc_si128( x[1], m128_zero ); \
} while(0);
// With pointers.
#define mm256_paesenckey_2x128( y, x, k ) do \
{ \
__m256i *X = (__m256i*)x; \
__m256i *Y = (__m256i*)y; \
__m256i *K = (__m256i*)ky; \
y[0] = _mm_aesenc_si128( x[0], K[0] ); \
y[1] = _mm_aesenc_si128( x[1], K[1] ); \
__m128i *X = (__m128i*)x; \
__m128i *Y = (__m128i*)y; \
Y[0] = _mm_aesenc_si128( X[0], k ); \
Y[1] = _mm_aesenc_si128( X[1], k ); \
} while(0);
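// Illustrative sketch (assumed usage, requires AES-NI; not from the original
// source): one AES round applied to both 128 bit halves of a 256 bit state
// kept as a pair of __m128i, sharing the same round key. Names are assumptions.
static inline void demo_aes_round_2x128( __m128i state[2], const __m128i roundkey )
{
   mm256_paesenc_2x128( state, state, roundkey );   // in place, both halves
}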
//
@@ -254,6 +284,42 @@ static inline void memset_256( __m256i *dst, const __m256i a, int n )
static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
///////////////////////////////
//
// AVX2 needed from now on.
//
#if defined(__AVX2__)
//
// Basic operations without SIMD equivalent
// Bitwise not ( ~x )
#define mm256_not( x ) _mm256_xor_si256( (x), m256_neg1 ) \
// Unary negation of each element ( -a )
#define mm256_negate_64( a ) _mm256_sub_epi64( m256_zero, a )
#define mm256_negate_32( a ) _mm256_sub_epi32( m256_zero, a )
#define mm256_negate_16( a ) _mm256_sub_epi16( m256_zero, a )
// Add 4 values, fewer dependencies than sequential addition.
#define mm256_add4_64( a, b, c, d ) \
_mm256_add_epi64( _mm256_add_epi64( a, b ), _mm256_add_epi64( c, d ) )
#define mm256_add4_32( a, b, c, d ) \
_mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) )
#define mm256_add4_16( a, b, c, d ) \
_mm256_add_epi16( _mm256_add_epi16( a, b ), _mm256_add_epi16( c, d ) )
#define mm256_add4_8( a, b, c, d ) \
_mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) )
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
//
// Bit rotations.
//
@@ -292,24 +358,27 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
// index vector c
#define mm256_rorv_64( v, c ) \
_mm256_or_si256( \
_mm256_srlv_epi64( v, _mm256_set1_epi64x( c ) ), \
_mm256_sllv_epi64( v, _mm256_set1_epi64x( 64-(c) ) ) )
_mm256_srlv_epi64( v, c ), \
_mm256_sllv_epi64( v, _mm256_sub_epi64( \
_mm256_set1_epi64x( 64 ), c ) ) )
#define mm256_rolv_64( v, c ) \
_mm256_or_si256( \
_mm256_sllv_epi64( v, _mm256_set1_epi64x( c ) ), \
_mm256_srlv_epi64( v, _mm256_set1_epi64x( 64-(c) ) ) )
_mm256_sllv_epi64( v, c ), \
_mm256_srlv_epi64( v, _mm256_sub_epi64( \
_mm256_set1_epi64x( 64 ), c ) ) )
#define mm256_rorv_32( v, c ) \
_mm256_or_si256( \
_mm256_srlv_epi32( v, _mm256_set1_epi32( c ) ), \
_mm256_sllv_epi32( v, _mm256_set1_epi32( 32-(c) ) ) )
_mm256_srlv_epi32( v, c ), \
_mm256_sllv_epi32( v, _mm256_sub_epi32( \
_mm256_set1_epi32( 32 ), c ) ) )
#define mm256_rolv_32( v, c ) \
_mm256_or_si256( \
_mm256_sllv_epi32( v, _mm256_set1_epi32( c ) ), \
_mm256_srlv_epi32( v, _mm256_set1_epi32( 32-(c) ) ) )
_mm256_sllv_epi32( v, c ), \
_mm256_srlv_epi32( v, _mm256_sub_epi32( \
_mm256_set1_epi32( 32 ), c ) ) )
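// Illustrative sketch (assumed usage, not from the original source): with the
// revised definitions the count is itself a vector, so each 64 bit lane can be
// rotated by a different amount. The rotate amounts are arbitrary examples.
static inline __m256i demo_mixed_ror_64( const __m256i v )
{
   const __m256i counts = _mm256_set_epi64x( 41, 17, 13, 7 );
   return mm256_rorv_64( v, counts );
}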
// AVX512 can do 16 bit elements.
@@ -326,17 +395,28 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
// A little faster with avx512
// Rotate 256 bit vector by one 32 bit element. Use 64 bit set, it's faster.
#define mm256_ror_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 0,7,6,5, 4,3,2,1 ) )
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000000000007, 0x0000000600000005, \
                     0x0000000400000003, 0x0000000200000001 ) )
#define mm256_rol_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 6,5,4,3, 2,1,0,7 ) )
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
                     0x0000000200000001, 0x0000000000000007 ) )
// Rotate 256 bit vector by three 32 bit elements (96 bits).
#define mm256_ror_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 2,1,0,7, 6,5,4,3 ) )
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000200000001, 0x0000000000000007, \
                     0x0000000600000005, 0x0000000400000003 ) )
#define mm256_rol_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 4,3,2,1, 0,7,6,5 ) )
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000400000003, 0x0000000200000001, \
                     0x0000000000000007, 0x0000000600000005 ) )
// AVX512 can do 16 & 8 bit elements.
#if defined(__AVX512VL__)
@@ -344,7 +424,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
// Rotate 256 bit vector by one 16 bit element.
#define mm256_ror_1x16( v ) \
_mm256_permutexvar_epi16( _mm256_set_epi16( \
0,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ), v )
0,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ), v )
#define mm256_rol_1x16( v ) \
_mm256_permutexvar_epi16( _mm256_set_epi16( \
@@ -354,7 +434,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
#define mm256_ror_1x8( v ) \
_mm256_permutexvar_epi8( _mm256_set_epi8( \
0,31,30,29,28,27,26,25, 24,23,22,21,20,19,18,17, \
16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ), v )
16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ), v )
#define mm256_rol_1x8( v ) \
_mm256_permutexvar_epi8( _mm256_set_epi8( \
@@ -363,14 +443,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
#endif // AVX512
// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm256_invert_64( v ) _mm256_permute4x64_epi64( a, 0x1b )
#define mm256_invert_32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 0,1,2,3,4,5,6,7 ) )
// AVX512 can do 16 & 8 bit elements.
//
// Rotate elements within lanes of 256 bit vector.
@@ -383,15 +455,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
// Rotate each 128 bit lane by one 16 bit element.
#define mm256_rol1x16_128( v ) \
_mm256_shuffle_epi8( 13,12,11,10, 9,8,7,6, 5,4,3,2, 1,0,15,14 )
_mm256_shuffle_epi8( v, _mm256_set_epi16( 6,5,4,3,2,1,0,7, \
6,5,4,3,2,1,0,7 ) )
#define mm256_ror1x16_128( v ) \
_mm256_shuffle_epi8( 1,0,15,14, 13,12,11,10, 9,8,7,6, 5,4,3,2 )
_mm256_shuffle_epi8( v, _mm256_set_epi16( 0,7,6,5,4,3,2,1, \
0,7,6,5,4,3,2,1 ) )
// Rotate each 128 bit lane by one byte
#define mm256_rol1x8_128( v ) \
_mm256_shuffle_epi8( 14, 13,12,11, 10,9,8,7, 6,5,4,3, 2,1,0,15 )
_mm256_shuffle_epi8( v, _mm256_set_epi8(14,13,12,11,10, 9, 8, 7, \
6, 5, 4, 3, 2, 1, 0,15, \
14,13,12,11,10, 9, 8, 7, \
6, 5, 4, 3, 2, 1, 0,15 ) )
#define mm256_ror1x8_128( v ) \
_mm256_shuffle_epi8( 0,15,14,13, 12,11,10,9, 8,7,6,5, 4,3,2,1 )
_mm256_shuffle_epi8( v, _mm256_set_epi8( 0,15,14,13,12,11,10, 9, \
8, 7, 6, 5, 4, 3, 2, 1, \
0,15,14,13,12,11,10, 9, \
8, 7, 6, 5, 4, 3, 2, 1 ) )
// Rotate each 128 bit lane by c bytes.
#define mm256_bror_128( v, c ) \
@@ -405,28 +485,27 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_ror16_64( v ) \
_mm256_shuffle_epi8( 9, 8,15,14,13,12,11,10, 1, 0, 7, 6, 5, 4, 3, 2 );
_mm256_shuffle_epi8( v, _mm256_set_epi16( 4,7,6,5,0,3,2,1, \
4,7,6,5,0,3,2,1 ) )
#define mm256_rol16_64( v ) \
_mm256_shuffle_epi8( 13,12,11,10, 9, 8,15,14, 5, 4, 3, 2, 1, 0, 7, 6 );
_mm256_shuffle_epi8( v, _mm256_set_epi16( 6,5,4,7,2,1,0,3, \
6,5,4,7,2,1,0,3 ) )
// Swap 16 bit elements in each 32 bit lane
#define mm256_swap16_32( v ) _mm256_shuffle_epi8( v, \
_mm_set_epi8( 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2 )
#define mm256_swap16_32( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi16( 6,7,4,5,2,3,0,1, \
6,7,4,5,2,3,0,1 ) )
//
// Swap bytes in vector elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( 8, 9,10,11,12,13,14,15, \
0, 1, 2, 3, 4, 5, 6, 7, \
8, 9,10,11,12,13,14,15, \
0, 1, 2, 3, 4, 5, 6, 7 ) )
_mm256_shuffle_epi8( v, m256_const_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
#define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( 12,13,14,15, 8, 9,10,11, \
4, 5, 6, 7, 0, 1, 2, 3, \
12,13,14,15, 8, 9,10,11, \
4, 5, 6, 7, 0, 1, 2, 3 ) )
_mm256_shuffle_epi8( v, m256_const_64( 0x0c0d0e0f08090a0b, \
0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
#define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
@@ -434,6 +513,36 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
14,15, 12,13, 10,11, 8, 9, \
6, 7, 4, 5, 2, 3, 0, 1 ) )
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes
#define mm256_block_bswap_64( d, s ) do \
{ \
__m256i ctl = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
} while(0)
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes
#define mm256_block_bswap_32( d, s ) do \
{ \
__m256i ctl = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
} while(0)
//
// Rotate two concatenated 256 bit vectors as one 512 bit vector by specified
// number of elements. Rotate is done in place, source arguments are
@@ -517,5 +626,6 @@ do { \
} while(0)
#endif // __AVX2__
#endif // SIMD_AVX2_H__
#endif // __AVX__
#endif // SIMD_256_H__

View File

@@ -1,5 +1,5 @@
#if !defined(SIMD_AVX512_H__)
#define SIMD_AVX512_H__ 1
#if !defined(SIMD_512_H__)
#define SIMD_512_H__ 1
#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -246,28 +246,22 @@
//
// Rotate elements in 512 bit vector.
#define mm512_swap_256( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 3,2,1,0, 7,6,5,4 ) )
#define mm512_swap_256( v ) _mm512_alignr_epi64( v, v, 4 )
#define mm512_ror_1x128( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 1,0, 7,6, 5,4, 3,2 ) )
#define mm512_ror_1x128( v ) _mm512_alignr_epi64( v, v, 2 )
#define mm512_rol_1x128( v ) _mm512_alignr_epi64( v, v, 6 )
#define mm512_rol_1x128( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 5,4, 3,2, 1,0, 7,6 ) )
#define mm512_ror_1x64( v ) _mm512_alignr_epi64( v, v, 1 )
#define mm512_rol_1x64( v ) _mm512_alignr_epi64( v, v, 7 )
#define mm512_ror_1x64( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 0,7,6,5,4,3,2,1 ) )
#define mm512_ror_1x32( v ) _mm512_alignr_epi32( v, v, 1 )
#define mm512_rol_1x32( v ) _mm512_alignr_epi32( v, v, 15 )
#define mm512_rol_1x64( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 6,5,4,3,2,1,0,7 ) )
// Generic for odd rotations
#define mm512_ror_x64( v, n ) _mm512_alignr_epi64( v, v, n )
#define mm512_ror_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
0,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm512_ror_x32( v, n ) _mm512_alignr_epi32( v, v, n )
#define mm512_rol_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15 ) )
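// Worked example (illustrative; the lane values are assumptions): with qword
// lanes written high to low,
//    v                     = { 7, 6, 5, 4, 3, 2, 1, 0 }
//    mm512_ror_1x64( v )   = { 0, 7, 6, 5, 4, 3, 2, 1 }
//    mm512_ror_x64( v, 3 ) = { 2, 1, 0, 7, 6, 5, 4, 3 }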
// Although documented to exist in AVX512F the _mm512_set_epi8 &
// _mm512_set_epi16 intrinsics fail to compile. Seems useful to have
@@ -282,7 +276,7 @@
0X00080007, 0X00060005, 0X00040003, 0X00020001 ) )
#define mm512_rol_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16( \
_mm512_permutexvar_epi16( v, _mm512_set_epi32( \
0x001E001D, 0x001C001B, 0x001A0019, 0x00180017, \
0X00160015, 0X00140013, 0X00120011, 0x0010000F, \
0X000E000D, 0X000C000B, 0X000A0009, 0X00080007, \
@@ -290,14 +284,14 @@
#define mm512_ror_1x8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8( \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x003F3E3D, 0x3C3B3A39, 0x38373635, 0x34333231, \
0x302F2E2D, 0x2C2B2A29, 0x28272625, 0x24232221, \
                     0x201F1E1D, 0x1C1B1A19, 0x18171615, 0x14131211, \
0x100F0E0D, 0x0C0B0A09, 0x08070605, 0x04030201 ) )
#define mm512_rol_1x8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8( \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
                     0x3E3D3C3B, 0x3A393837, 0x36353433, 0x3231302F, \
0x2E2D2C2B, 0x2A292827, 0x26252423, 0x2221201F, \
0x1E1D1C1B, 0x1A191817, 0x16151413, 0x1211100F, \
@@ -601,4 +595,4 @@ do { \
} while(0)
#endif // AVX512
#endif // SIMD_AVX512_H__
#endif // SIMD_512_H__

View File

@@ -1,5 +1,5 @@
#if !defined(SIMD_MMX_H__)
#define SIMD_MMX_H__ 1
#if !defined(SIMD_64_H__)
#define SIMD_64_H__ 1
#if defined(__MMX__)
@@ -13,21 +13,20 @@
// Pseudo constants
/*
#define m64_zero _mm_setzero_si64()
#define m64_one_64 _mm_set_pi32( 0UL, 1UL )
#define m64_one_32 _mm_set1_pi32( 1UL )
#define m64_one_16 _mm_set1_pi16( 1U )
#define m64_one_8 _mm_set1_pi8( 1U );
#define m64_neg1 _mm_set1_pi32( 0xFFFFFFFFUL )
/* cast also works, which is better?
*/
#define m64_zero ( (__m64)0ULL )
#define m64_one_64 ( (__m64)1ULL )
#define m64_one_32 ( (__m64)0x0000000100000001ULL )
#define m64_one_16 ( (__m64)0x0001000100010001ULL )
#define m64_one_8 ( (__m64)0x0101010101010101ULL )
#define m64_neg1 ( (__m64)0xFFFFFFFFFFFFFFFFULL )
*/
#define casti_m64(p,i) (((__m64*)(p))[(i)])
@@ -42,6 +41,14 @@
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, (__m64)v )
// Rotate bits in packed elements of 64 bit vector
#define mm64_rol_64( a, n ) \
_mm_or_si64( _mm_slli_si64( (__m64)(a), n ), \
_mm_srli_si64( (__m64)(a), 64-(n) ) )
#define mm64_ror_64( a, n ) \
_mm_or_si64( _mm_srli_si64( (__m64)(a), n ), \
_mm_slli_si64( (__m64)(a), 64-(n) ) )
#define mm64_rol_32( a, n ) \
_mm_or_si64( _mm_slli_pi32( (__m64)(a), n ), \
_mm_srli_pi32( (__m64)(a), 32-(n) ) )
@@ -78,22 +85,20 @@
// Endian byte swap packed elements
// A vectorized version of the u64 bswap, use when data already in MMX reg.
#define mm64_bswap_64( v ) \
_mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 0,1,2,3,4,5,6,7 ) )
_mm_shuffle_pi8( (__m64)v, (__m64)0x0001020304050607 )
#define mm64_bswap_32( v ) \
_mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 4,5,6,7, 0,1,2,3 ) )
_mm_shuffle_pi8( (__m64)v, (__m64)0x0405060700010203 )
/*
#define mm64_bswap_16( v ) \
_mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 6,7, 4,5, 2,3, 0,1 ) );
*/
_mm_shuffle_pi8( (__m64)v, (__m64)0x0607040502030001 );
#else
#define mm64_bswap_64( v ) \
(__m64)__builtin_bswap64( (uint64_t)v )
// This exists only for compatibility with CPUs without SSSE3. MMX doesn't
// These exist only for compatibility with CPUs without SSSE3. MMX doesn't
// have an extract 32 instruction, so pointers are needed to access elements.
// It's more efficient for the caller to use scalar variables and call
// bswap_32 directly.
@@ -101,20 +106,11 @@
_mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \
__builtin_bswap32( ((uint32_t*)&v)[0] ) )
#endif
// Invert vector: {3,2,1,0} -> {0,1,2,3}
// Invert_64 is the same as bswap64
// Invert_32 is the same as swap32
#define mm64_invert_16( v ) _mm_shuffle_pi16( (__m64)v, 0x1b )
#if defined(__SSSE3__)
// An SSE2 or MMX version of this would be monstrous, shifting, masking and
// ORing each byte individually.
#define mm64_invert_8( v ) \
_mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 0,1,2,3,4,5,6,7 ) );
#define mm64_bswap_16( v ) \
_mm_set_pi16( __builtin_bswap16( ((uint16_t*)&v)[3] ), \
__builtin_bswap16( ((uint16_t*)&v)[2] ), \
__builtin_bswap16( ((uint16_t*)&v)[1] ), \
__builtin_bswap16( ((uint16_t*)&v)[0] ) )
#endif
@@ -131,5 +127,5 @@ static inline void memset_m64( __m64 *dst, const __m64 a, int n )
#endif // MMX
#endif // SIMD_MMX_H__
#endif // SIMD_64_H__

View File

@@ -62,10 +62,16 @@ static inline void memset_64( uint64_t *dst, const uint64_t a, int n )
//
// 128 bit integers
//
// 128 bit integers are inefficient and not a shortcut for __m128i.
// No real need or use.
//#define u128_neg1 ((uint128_t)(-1))
// Useful for making constants.
#define mk_uint128( hi, lo ) \
( ( (uint128_t)(hi) << 64 ) | ( (uint128_t)(lo) ) )
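// Illustrative sketch (assumed usage, assumes the uint128_t typedef above;
// not from the original source): composing a 128 bit value from two 64 bit
// halves, e.g. for a mask or comparison spanning all 128 bits.
static inline uint128_t demo_make_u128( uint64_t hi, uint64_t lo )
{
   return mk_uint128( hi, lo );
}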
// Extracting the low bits is a trivial cast.
// These specialized functions are optimized while providing a
// consistent interface.

View File

@@ -14,11 +14,13 @@
#ifndef WIN32
#define HWMON_PATH \
"/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
"/sys/class/hwmon/hwmon2/temp1_input"
#define HWMON_ALT \
"/sys/class/hwmon/hwmon1/temp1_input"
#define HWMON_ALT2 \
"/sys/class/hwmon/hwmon0/temp1_input"
#define HWMON_ALT1 \
"/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
#define HWMON_ALT2 \
"/sys/class/hwmon/hwmon1/temp1_input"
#define HWMON_ALT3 \
"/sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input"
#define HWMON_ALT4 \

5
util.c
View File

@@ -1631,7 +1631,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
hashrate += thr_hashrates[i];
pthread_mutex_unlock(&stats_lock);
double diff = trunc( ( ((double)0xffffffff) / target ) );
if ( opt_showdiff )
if ( !opt_quiet )
// xmr pool diff can change a lot...
applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
stratum_diff = diff;
@@ -1813,7 +1813,8 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
/* store for api stats */
stratum_diff = diff;
applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
if ( !opt_quiet )
applog(LOG_BLUE, "Stratum difficulty set to %g", diff);
return true;
}